lindera_fst/map.rs
1use std::fmt;
2use std::io;
3use std::iter::FromIterator;
4
5use crate::automaton::{AlwaysMatch, Automaton};
6use crate::raw;
7pub use crate::raw::IndexedValue;
8use crate::stream::{IntoStreamer, Streamer};
9use crate::Result;
10use std::ops::Deref;
11
12/// Map is a lexicographically ordered map from byte strings to integers.
13///
14/// A `Map` is constructed with the `MapBuilder` type. Alternatively, a `Map`
15/// can be constructed in memory from a lexicographically ordered iterator
16/// of key-value pairs (`Map::from_iter`).
17///
18/// A key feature of `Map` is that it can be serialized to disk compactly. Its
19/// underlying representation is built such that the `Map` can be memory mapped
20/// (`Map::from_path`) and searched without necessarily loading the entire
21/// map into memory.
22///
23/// It supports most common operations associated with maps, such as key
24/// lookup and search. It also supports set operations on its keys along with
25/// the ability to specify how conflicting values are merged together. Maps
26/// also support range queries and automata based searches (e.g. a regular
27/// expression).
28///
29/// Maps are represented by a finite state transducer where inputs are the keys
30/// and outputs are the values. As such, maps have the following invariants:
31///
32/// 1. Once constructed, a `Map` can never be modified.
33/// 2. Maps must be constructed with lexicographically ordered byte sequences.
34/// There is no restricting on the ordering of values.
35///
36/// # Differences with sets
37///
38/// Maps and sets are represented by the same underlying data structure: the
39/// finite state transducer. The principal difference between them is that
40/// sets always have their output values set to `0`. This has an impact on the
41/// representation size and is reflected in the type system for convenience.
42/// A secondary but subtle difference is that duplicate values can be added
43/// to a set, but it is an error to do so with maps. That is, a set can have
44/// the same key added sequentially, but a map can't.
45///
46/// # The future
47///
48/// It is regrettable that the output value is fixed to `u64`. Indeed, it is
49/// not necessary, but it was a major simplification in the implementation.
50/// In the future, the value type may become generic to an extent (outputs must
51/// satisfy a basic algebra).
52///
53/// Keys will always be byte strings; however, we may grow more conveniences
54/// around dealing with them (such as a serialization/deserialization step,
55/// although it isn't clear where exactly this should live).
56pub struct Map<Data>(raw::Fst<Data>);
57
58impl Map<Vec<u8>> {
59 /// Creates a map from its representation as a raw byte sequence.
60 ///
61 /// Note that this operation is very cheap (no allocations and no copies).
62 ///
63 /// The map must have been written with a compatible finite state
64 /// transducer builder (`MapBuilder` qualifies). If the format is invalid
65 /// or if there is a mismatch between the API version of this library
66 /// and the map, then an error is returned.
67 pub fn from_bytes(bytes: Vec<u8>) -> Result<Map<Vec<u8>>> {
68 raw::Fst::new(bytes).map(Map)
69 }
70
71 /// Create a `Map` from an iterator of lexicographically ordered byte
72 /// strings and associated values.
73 ///
74 /// If the iterator does not yield unique keys in lexicographic order, then
75 /// an error is returned.
76 ///
77 /// Note that this is a convenience function to build a map in memory.
78 /// To build a map that streams to an arbitrary `io::Write`, use
79 /// `MapBuilder`.
80 pub fn from_iter<K, I>(iter: I) -> Result<Self>
81 where
82 K: AsRef<[u8]>,
83 I: IntoIterator<Item = (K, u64)>,
84 {
85 let mut builder = MapBuilder::memory();
86 builder.extend_iter(iter)?;
87 Map::from_bytes(builder.into_inner()?)
88 }
89}
90
91impl<Data: Deref<Target = [u8]>> Map<Data> {
92 /// Tests the membership of a single key.
93 ///
94 /// # Example
95 ///
96 /// ```rust
97 /// use lindera_fst::Map;
98 ///
99 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
100 ///
101 /// assert_eq!(map.contains_key("b"), true);
102 /// assert_eq!(map.contains_key("z"), false);
103 /// ```
104 pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
105 self.0.contains_key(key)
106 }
107
108 /// Retrieves the value associated with a key.
109 ///
110 /// If the key does not exist, then `None` is returned.
111 ///
112 /// # Example
113 ///
114 /// ```rust
115 /// use lindera_fst::Map;
116 ///
117 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
118 ///
119 /// assert_eq!(map.get("b"), Some(2));
120 /// assert_eq!(map.get("z"), None);
121 /// ```
122 pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<u64> {
123 self.0.get(key).map(|output| output.value())
124 }
125
126 /// Return a lexicographically ordered stream of all key-value pairs in
127 /// this map.
128 ///
129 /// While this is a stream, it does require heap space proportional to the
130 /// longest key in the map.
131 ///
132 /// If the map is memory mapped, then no further heap space is needed.
133 /// Note though that your operating system may fill your page cache
134 /// (which will cause the resident memory usage of the process to go up
135 /// correspondingly).
136 ///
137 /// # Example
138 ///
139 /// Since streams are not iterators, the traditional `for` loop cannot be
140 /// used. `while let` is useful instead:
141 ///
142 /// ```rust
143 /// use lindera_fst::{IntoStreamer, Streamer, Map};
144 ///
145 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
146 /// let mut stream = map.stream();
147 ///
148 /// let mut kvs = vec![];
149 /// while let Some((k, v)) = stream.next() {
150 /// kvs.push((k.to_vec(), v));
151 /// }
152 /// assert_eq!(kvs, vec![
153 /// (b"a".to_vec(), 1),
154 /// (b"b".to_vec(), 2),
155 /// (b"c".to_vec(), 3),
156 /// ]);
157 /// ```
158 #[inline]
159 pub fn stream(&self) -> Stream {
160 Stream(self.0.stream())
161 }
162
163 /// Return a lexicographically ordered stream of all keys in this map.
164 ///
165 /// Memory requirements are the same as described on `Map::stream`.
166 ///
167 /// # Example
168 ///
169 /// ```rust
170 /// use lindera_fst::{IntoStreamer, Streamer, Map};
171 ///
172 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
173 /// let mut stream = map.keys();
174 ///
175 /// let mut keys = vec![];
176 /// while let Some(k) = stream.next() {
177 /// keys.push(k.to_vec());
178 /// }
179 /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
180 /// ```
181 #[inline]
182 pub fn keys(&self) -> Keys {
183 Keys(self.0.stream())
184 }
185
186 /// Return a stream of all values in this map ordered lexicographically
187 /// by each value's corresponding key.
188 ///
189 /// Memory requirements are the same as described on `Map::stream`.
190 ///
191 /// # Example
192 ///
193 /// ```rust
194 /// use lindera_fst::{IntoStreamer, Streamer, Map};
195 ///
196 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
197 /// let mut stream = map.values();
198 ///
199 /// let mut values = vec![];
200 /// while let Some(v) = stream.next() {
201 /// values.push(v);
202 /// }
203 /// assert_eq!(values, vec![1, 2, 3]);
204 /// ```
205 #[inline]
206 pub fn values(&self) -> Values {
207 Values(self.0.stream())
208 }
209
210 /// Return a builder for range queries.
211 ///
212 /// A range query returns a subset of key-value pairs in this map in a
213 /// range given in lexicographic order.
214 ///
215 /// Memory requirements are the same as described on `Map::stream`.
216 /// Notably, only the keys in the range are read; keys outside the range
217 /// are not.
218 ///
219 /// # Example
220 ///
221 /// Returns only the key-value pairs in the range given.
222 ///
223 /// ```rust
224 /// use lindera_fst::{IntoStreamer, Streamer, Map};
225 ///
226 /// let map = Map::from_iter(vec![
227 /// ("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5),
228 /// ]).unwrap();
229 /// let mut stream = map.range().ge("b").lt("e").into_stream();
230 ///
231 /// let mut kvs = vec![];
232 /// while let Some((k, v)) = stream.next() {
233 /// kvs.push((k.to_vec(), v));
234 /// }
235 /// assert_eq!(kvs, vec![
236 /// (b"b".to_vec(), 2),
237 /// (b"c".to_vec(), 3),
238 /// (b"d".to_vec(), 4),
239 /// ]);
240 /// ```
241 #[inline]
242 pub fn range(&self) -> StreamBuilder {
243 StreamBuilder(self.0.range())
244 }
245
246 /// Executes an automaton on the keys of this map.
247 ///
248 /// Note that this returns a `StreamBuilder`, which can be used to
249 /// add a range query to the search (see the `range` method).
250 ///
251 /// Memory requirements are the same as described on `Map::stream`.
252 ///
253 /// # Example
254 ///
255 /// An implementation of regular expressions for `Automaton` is available
256 /// in the `fst-regex` crate, which can be used to search maps.
257 ///
258 /// ```rust
259 ///
260 /// use std::error::Error;
261 ///
262 /// use lindera_fst::{IntoStreamer, Streamer, Map};
263 /// use lindera_fst::Regex;
264 ///
265 /// # fn main() { example().unwrap(); }
266 /// fn example() -> Result<(), Box<Error>> {
267 /// let map = Map::from_iter(vec![
268 /// ("foo", 1), ("foo1", 2), ("foo2", 3), ("foo3", 4), ("foobar", 5),
269 /// ]).unwrap();
270 ///
271 /// let re = Regex::new("f[a-z]+3?").unwrap();
272 /// let mut stream = map.search(&re).into_stream();
273 ///
274 /// let mut kvs = vec![];
275 /// while let Some((k, v)) = stream.next() {
276 /// kvs.push((k.to_vec(), v));
277 /// }
278 /// assert_eq!(kvs, vec![
279 /// (b"foo".to_vec(), 1),
280 /// (b"foo3".to_vec(), 4),
281 /// (b"foobar".to_vec(), 5),
282 /// ]);
283 ///
284 /// Ok(())
285 /// }
286 /// ```
287 pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
288 StreamBuilder(self.0.search(aut))
289 }
290
291 /// Returns the number of elements in this map.
292 #[inline]
293 pub fn len(&self) -> usize {
294 self.0.len()
295 }
296
297 /// Returns true if and only if this map is empty.
298 #[inline]
299 pub fn is_empty(&self) -> bool {
300 self.0.is_empty()
301 }
302
303 /// Creates a new map operation with this map added to it.
304 ///
305 /// The `OpBuilder` type can be used to add additional map streams
306 /// and perform set operations like union, intersection, difference and
307 /// symmetric difference on the keys of the map. These set operations also
308 /// allow one to specify how conflicting values are merged in the stream.
309 ///
310 /// # Example
311 ///
312 /// This example demonstrates a union on multiple map streams. Notice that
313 /// the stream returned from the union is not a sequence of key-value
314 /// pairs, but rather a sequence of keys associated with one or more
315 /// values. Namely, a key is associated with each value associated with
316 /// that same key in the all of the streams.
317 ///
318 /// ```rust
319 /// use lindera_fst::{Streamer, Map};
320 /// use lindera_fst::{map::IndexedValue};
321 ///
322 /// let map1 = Map::from_iter(vec![
323 /// ("a", 1), ("b", 2), ("c", 3),
324 /// ]).unwrap();
325 /// let map2 = Map::from_iter(vec![
326 /// ("a", 10), ("y", 11), ("z", 12),
327 /// ]).unwrap();
328 ///
329 /// let mut union = map1.op().add(&map2).union();
330 ///
331 /// let mut kvs = vec![];
332 /// while let Some((k, vs)) = union.next() {
333 /// kvs.push((k.to_vec(), vs.to_vec()));
334 /// }
335 /// assert_eq!(kvs, vec![
336 /// (b"a".to_vec(), vec![
337 /// IndexedValue { index: 0, value: 1 },
338 /// IndexedValue { index: 1, value: 10 },
339 /// ]),
340 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
341 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
342 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 11 }]),
343 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
344 /// ]);
345 /// ```
346 #[inline]
347 pub fn op(&self) -> OpBuilder {
348 OpBuilder::new().add(self)
349 }
350
351 /// Returns a reference to the underlying raw finite state transducer.
352 #[inline]
353 pub fn as_fst(&self) -> &raw::Fst<Data> {
354 &self.0
355 }
356}
357
358impl<Data: Deref<Target = [u8]>> fmt::Debug for Map<Data> {
359 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
360 write!(f, "Map([")?;
361 let mut stream = self.stream();
362 let mut first = true;
363 while let Some((k, v)) = stream.next() {
364 if !first {
365 write!(f, ", ")?;
366 }
367 first = false;
368 write!(f, "({}, {})", String::from_utf8_lossy(k), v)?;
369 }
370 write!(f, "])")
371 }
372}
373
374// Construct a map from an Fst object.
375impl<Data> From<raw::Fst<Data>> for Map<Data> {
376 #[inline]
377 fn from(fst: raw::Fst<Data>) -> Self {
378 Map(fst)
379 }
380}
381
382/// Returns the underlying finite state transducer.
383impl<Data> AsRef<raw::Fst<Data>> for Map<Data> {
384 #[inline]
385 fn as_ref(&self) -> &raw::Fst<Data> {
386 &self.0
387 }
388}
389
390impl<'m, 'a, Data: Deref<Target = [u8]>> IntoStreamer<'a> for &'m Map<Data> {
391 type Item = (&'a [u8], u64);
392 type Into = Stream<'m>;
393
394 #[inline]
395 fn into_stream(self) -> Self::Into {
396 Stream(self.0.stream())
397 }
398}
399
400/// A builder for creating a map.
401///
402/// This is not your average everyday builder. It has two important qualities
403/// that make it a bit unique from what you might expect:
404///
405/// 1. All keys must be added in lexicographic order. Adding a key out of order
406/// will result in an error. Additionally, adding a duplicate key will also
407/// result in an error. That is, once a key is associated with a value,
408/// that association can never be modified or deleted.
409/// 2. The representation of a map is streamed to *any* `io::Write` as it is
410/// built. For an in memory representation, this can be a `Vec<u8>`.
411///
412/// Point (2) is especially important because it means that a map can be
413/// constructed *without storing the entire map in memory*. Namely, since it
414/// works with any `io::Write`, it can be streamed directly to a file.
415///
416/// With that said, the builder does use memory, but **memory usage is bounded
417/// to a constant size**. The amount of memory used trades off with the
418/// compression ratio. Currently, the implementation hard codes this trade off
419/// which can result in about 5-20MB of heap usage during construction. (N.B.
420/// Guaranteeing a maximal compression ratio requires memory proportional to
421/// the size of the map, which defeats some of the benefit of streaming
422/// it to disk. In practice, a small bounded amount of memory achieves
423/// close-to-minimal compression ratios.)
424///
425/// The algorithmic complexity of map construction is `O(n)` where `n` is the
426/// number of elements added to the map.
427///
428/// # Example: build in memory
429///
430/// This shows how to use the builder to construct a map in memory. Note that
431/// `Map::from_iter` provides a convenience function that achieves this same
432/// goal without needing to explicitly use `MapBuilder`.
433///
434/// ```rust
435/// use lindera_fst::{IntoStreamer, Streamer, Map, MapBuilder};
436///
437/// let mut build = MapBuilder::memory();
438/// build.insert("bruce", 1).unwrap();
439/// build.insert("clarence", 2).unwrap();
440/// build.insert("stevie", 3).unwrap();
441///
442/// // You could also call `finish()` here, but since we're building the map in
443/// // memory, there would be no way to get the `Vec<u8>` back.
444/// let bytes = build.into_inner().unwrap();
445///
446/// // At this point, the map has been constructed, but here's how to read it.
447/// let map = Map::from_bytes(bytes).unwrap();
448/// let mut stream = map.into_stream();
449/// let mut kvs = vec![];
450/// while let Some((k, v)) = stream.next() {
451/// kvs.push((k.to_vec(), v));
452/// }
453/// assert_eq!(kvs, vec![
454/// (b"bruce".to_vec(), 1),
455/// (b"clarence".to_vec(), 2),
456/// (b"stevie".to_vec(), 3),
457/// ]);
458/// ```
459pub struct MapBuilder<W>(raw::Builder<W>);
460
461impl MapBuilder<Vec<u8>> {
462 /// Create a builder that builds a map in memory.
463 #[inline]
464 pub fn memory() -> Self {
465 MapBuilder(raw::Builder::memory())
466 }
467}
468
469impl<W: io::Write> MapBuilder<W> {
470 /// Create a builder that builds a map by writing it to `wtr` in a
471 /// streaming fashion.
472 pub fn new(wtr: W) -> Result<MapBuilder<W>> {
473 raw::Builder::new_type(wtr, 0).map(MapBuilder)
474 }
475
476 /// Insert a new key-value pair into the map.
477 ///
478 /// Keys must be convertible to byte strings. Values must be a `u64`, which
479 /// is a restriction of the current implementation of finite state
480 /// transducers. (Values may one day be expanded to other types.)
481 ///
482 /// If a key is inserted that is less than or equal to any previous key
483 /// added, then an error is returned. Similarly, if there was a problem
484 /// writing to the underlying writer, an error is returned.
485 pub fn insert<K: AsRef<[u8]>>(&mut self, key: K, val: u64) -> Result<()> {
486 self.0.insert(key, val)
487 }
488
489 /// Calls insert on each item in the iterator.
490 ///
491 /// If an error occurred while adding an element, processing is stopped
492 /// and the error is returned.
493 ///
494 /// If a key is inserted that is less than or equal to any previous key
495 /// added, then an error is returned. Similarly, if there was a problem
496 /// writing to the underlying writer, an error is returned.
497 pub fn extend_iter<K, I>(&mut self, iter: I) -> Result<()>
498 where
499 K: AsRef<[u8]>,
500 I: IntoIterator<Item = (K, u64)>,
501 {
502 self.0
503 .extend_iter(iter.into_iter().map(|(k, v)| (k, raw::Output::new(v))))
504 }
505
506 /// Calls insert on each item in the stream.
507 ///
508 /// Note that unlike `extend_iter`, this is not generic on the items in
509 /// the stream.
510 ///
511 /// If a key is inserted that is less than or equal to any previous key
512 /// added, then an error is returned. Similarly, if there was a problem
513 /// writing to the underlying writer, an error is returned.
514 pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
515 where
516 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
517 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
518 {
519 self.0.extend_stream(StreamOutput(stream.into_stream()))
520 }
521
522 /// Finishes the construction of the map and flushes the underlying
523 /// writer. After completion, the data written to `W` may be read using
524 /// one of `Map`'s constructor methods.
525 pub fn finish(self) -> Result<()> {
526 self.0.finish()
527 }
528
529 /// Just like `finish`, except it returns the underlying writer after
530 /// flushing it.
531 pub fn into_inner(self) -> Result<W> {
532 self.0.into_inner()
533 }
534
535 /// Gets a reference to the underlying writer.
536 pub fn get_ref(&self) -> &W {
537 self.0.get_ref()
538 }
539
540 /// Returns the number of bytes written to the underlying writer
541 pub fn bytes_written(&self) -> u64 {
542 self.0.bytes_written()
543 }
544}
545
546/// A lexicographically ordered stream of key-value pairs from a map.
547///
548/// The `A` type parameter corresponds to an optional automaton to filter
549/// the stream. By default, no filtering is done.
550///
551/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
552pub struct Stream<'m, A = AlwaysMatch>(raw::Stream<'m, A>)
553where
554 A: Automaton;
555
556impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> {
557 type Item = (&'a [u8], u64);
558
559 fn next(&'a mut self) -> Option<Self::Item> {
560 self.0.next().map(|(key, out)| (key, out.value()))
561 }
562}
563
564impl<'m, A: Automaton> Stream<'m, A> {
565 /// Convert this stream into a vector of byte strings and outputs.
566 ///
567 /// Note that this creates a new allocation for every key in the stream.
568 pub fn into_byte_vec(self) -> Vec<(Vec<u8>, u64)> {
569 self.0.into_byte_vec()
570 }
571
572 /// Convert this stream into a vector of Unicode strings and outputs.
573 ///
574 /// If any key is not valid UTF-8, then iteration on the stream is stopped
575 /// and a UTF-8 decoding error is returned.
576 ///
577 /// Note that this creates a new allocation for every key in the stream.
578 pub fn into_str_vec(self) -> Result<Vec<(String, u64)>> {
579 self.0.into_str_vec()
580 }
581
582 /// Convert this stream into a vector of byte strings.
583 ///
584 /// Note that this creates a new allocation for every key in the stream.
585 pub fn into_byte_keys(self) -> Vec<Vec<u8>> {
586 self.0.into_byte_keys()
587 }
588
589 /// Convert this stream into a vector of Unicode strings.
590 ///
591 /// If any key is not valid UTF-8, then iteration on the stream is stopped
592 /// and a UTF-8 decoding error is returned.
593 ///
594 /// Note that this creates a new allocation for every key in the stream.
595 pub fn into_str_keys(self) -> Result<Vec<String>> {
596 self.0.into_str_keys()
597 }
598
599 /// Convert this stream into a vector of outputs.
600 pub fn into_values(self) -> Vec<u64> {
601 self.0.into_values()
602 }
603}
604
605/// A lexicographically ordered stream of keys from a map.
606///
607/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
608pub struct Keys<'m>(raw::Stream<'m>);
609
610impl<'a, 'm> Streamer<'a> for Keys<'m> {
611 type Item = &'a [u8];
612
613 #[inline]
614 fn next(&'a mut self) -> Option<Self::Item> {
615 self.0.next().map(|(key, _)| key)
616 }
617}
618
619/// A stream of values from a map, lexicographically ordered by each value's
620/// corresponding key.
621///
622/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
623pub struct Values<'m>(raw::Stream<'m>);
624
625impl<'a, 'm> Streamer<'a> for Values<'m> {
626 type Item = u64;
627
628 #[inline]
629 fn next(&'a mut self) -> Option<Self::Item> {
630 self.0.next().map(|(_, out)| out.value())
631 }
632}
633
634/// A builder for constructing range queries on streams.
635///
636/// Once all bounds are set, one should call `into_stream` to get a
637/// `Stream`.
638///
639/// Bounds are not additive. That is, if `ge` is called twice on the same
640/// builder, then the second setting wins.
641///
642/// The `A` type parameter corresponds to an optional automaton to filter
643/// the stream. By default, no filtering is done.
644///
645/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
646pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>);
647
648impl<'m, A: Automaton> StreamBuilder<'m, A> {
649 /// Specify a greater-than-or-equal-to bound.
650 pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> Self {
651 StreamBuilder(self.0.ge(bound))
652 }
653
654 /// Specify a greater-than bound.
655 pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> Self {
656 StreamBuilder(self.0.gt(bound))
657 }
658
659 /// Specify a less-than-or-equal-to bound.
660 pub fn le<T: AsRef<[u8]>>(self, bound: T) -> Self {
661 StreamBuilder(self.0.le(bound))
662 }
663
664 /// Specify a less-than bound.
665 pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> Self {
666 StreamBuilder(self.0.lt(bound))
667 }
668
669 /// Make it iterate backwards.
670 pub fn backward(self) -> Self {
671 StreamBuilder(self.0.backward())
672 }
673
674 /// Return this builder and gives the automaton states
675 /// along with the results.
676 pub fn with_state(self) -> StreamWithStateBuilder<'m, A> {
677 StreamWithStateBuilder(self.0.with_state())
678 }
679}
680
681impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> {
682 type Item = (&'a [u8], u64);
683 type Into = Stream<'m, A>;
684
685 fn into_stream(self) -> Self::Into {
686 Stream(self.0.into_stream())
687 }
688}
689
690/// A builder for constructing range queries of streams
691/// that returns results along with automaton states.
692///
693/// Once all bounds are set, one should call `into_stream` to get a
694/// `StreamWithState`.
695///
696/// Bounds are not additive. That is, if `ge` is called twice on the same
697/// builder, then the second setting wins.
698///
699/// The `A` type parameter corresponds to an optional automaton to filter
700/// the stream. By default, no filtering is done.
701///
702/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
703pub struct StreamWithStateBuilder<'m, A = AlwaysMatch>(raw::StreamWithStateBuilder<'m, A>);
704
705impl<'m, 'a, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'m, A>
706where
707 A::State: Clone,
708{
709 type Item = (&'a [u8], u64, A::State);
710 type Into = StreamWithState<'m, A>;
711
712 fn into_stream(self) -> Self::Into {
713 StreamWithState(self.0.into_stream())
714 }
715}
716
717/// A builder for collecting map streams on which to perform set operations
718/// on the keys of maps.
719///
720/// Set operations include intersection, union, difference and symmetric
721/// difference. The result of each set operation is itself a stream that emits
722/// pairs of keys and a sequence of each occurrence of that key in the
723/// participating streams. This information allows one to perform set
724/// operations on maps and customize how conflicting output values are handled.
725///
726/// All set operations work efficiently on an arbitrary number of
727/// streams with memory proportional to the number of streams.
728///
729/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
730/// where `n1, n2, n3, ...` correspond to the number of elements in each
731/// stream.
732///
733/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
734pub struct OpBuilder<'m>(raw::OpBuilder<'m>);
735
736impl<'m> OpBuilder<'m> {
737 /// Create a new set operation builder.
738 #[inline]
739 pub fn new() -> Self {
740 OpBuilder(raw::OpBuilder::default())
741 }
742
743 /// Add a stream to this set operation.
744 ///
745 /// This is useful for a chaining style pattern, e.g.,
746 /// `builder.add(stream1).add(stream2).union()`.
747 ///
748 /// The stream must emit a lexicographically ordered sequence of key-value
749 /// pairs.
750 pub fn add<I, S>(mut self, streamable: I) -> Self
751 where
752 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
753 S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
754 {
755 self.push(streamable);
756 self
757 }
758
759 /// Add a stream to this set operation.
760 ///
761 /// The stream must emit a lexicographically ordered sequence of key-value
762 /// pairs.
763 pub fn push<I, S>(&mut self, streamable: I)
764 where
765 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
766 S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
767 {
768 self.0.push(StreamOutput(streamable.into_stream()));
769 }
770
771 /// Performs a union operation on all streams that have been added.
772 ///
773 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
774 /// first element of the tuple is the byte string key. The second element
775 /// of the tuple is a list of all occurrences of that key in participating
776 /// streams. The `IndexedValue` contains an index and the value associated
777 /// with that key in that stream. The index uniquely identifies each
778 /// stream, which is an integer that is auto-incremented when a stream
779 /// is added to this operation (starting at `0`).
780 ///
781 /// # Example
782 ///
783 /// ```rust
784 /// use lindera_fst::{IntoStreamer, Streamer, Map};
785 /// use lindera_fst::map::IndexedValue;
786 ///
787 /// let map1 = Map::from_iter(vec![
788 /// ("a", 1), ("b", 2), ("c", 3),
789 /// ]).unwrap();
790 /// let map2 = Map::from_iter(vec![
791 /// ("a", 11), ("y", 12), ("z", 13),
792 /// ]).unwrap();
793 ///
794 /// let mut union = map1.op().add(&map2).union();
795 ///
796 /// let mut kvs = vec![];
797 /// while let Some((k, vs)) = union.next() {
798 /// kvs.push((k.to_vec(), vs.to_vec()));
799 /// }
800 /// assert_eq!(kvs, vec![
801 /// (b"a".to_vec(), vec![
802 /// IndexedValue { index: 0, value: 1 },
803 /// IndexedValue { index: 1, value: 11 },
804 /// ]),
805 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
806 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
807 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
808 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
809 /// ]);
810 /// ```
811 #[inline]
812 pub fn union(self) -> Union<'m> {
813 Union(self.0.union())
814 }
815
816 /// Performs an intersection operation on all streams that have been added.
817 ///
818 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
819 /// first element of the tuple is the byte string key. The second element
820 /// of the tuple is a list of all occurrences of that key in participating
821 /// streams. The `IndexedValue` contains an index and the value associated
822 /// with that key in that stream. The index uniquely identifies each
823 /// stream, which is an integer that is auto-incremented when a stream
824 /// is added to this operation (starting at `0`).
825 ///
826 /// # Example
827 ///
828 /// ```rust
829 /// use lindera_fst::{IntoStreamer, Streamer, Map};
830 /// use lindera_fst::map::IndexedValue;
831 ///
832 /// let map1 = Map::from_iter(vec![
833 /// ("a", 1), ("b", 2), ("c", 3),
834 /// ]).unwrap();
835 /// let map2 = Map::from_iter(vec![
836 /// ("a", 11), ("y", 12), ("z", 13),
837 /// ]).unwrap();
838 ///
839 /// let mut intersection = map1.op().add(&map2).intersection();
840 ///
841 /// let mut kvs = vec![];
842 /// while let Some((k, vs)) = intersection.next() {
843 /// kvs.push((k.to_vec(), vs.to_vec()));
844 /// }
845 /// assert_eq!(kvs, vec![
846 /// (b"a".to_vec(), vec![
847 /// IndexedValue { index: 0, value: 1 },
848 /// IndexedValue { index: 1, value: 11 },
849 /// ]),
850 /// ]);
851 /// ```
852 #[inline]
853 pub fn intersection(self) -> Intersection<'m> {
854 Intersection(self.0.intersection())
855 }
856
857 /// Performs a difference operation with respect to the first stream added.
858 /// That is, this returns a stream of all elements in the first stream
859 /// that don't exist in any other stream that has been added.
860 ///
861 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
862 /// first element of the tuple is the byte string key. The second element
863 /// of the tuple is a list of all occurrences of that key in participating
864 /// streams. The `IndexedValue` contains an index and the value associated
865 /// with that key in that stream. The index uniquely identifies each
866 /// stream, which is an integer that is auto-incremented when a stream
867 /// is added to this operation (starting at `0`).
868 ///
869 /// # Example
870 ///
871 /// ```rust
872 /// use lindera_fst::{Streamer, Map};
873 /// use lindera_fst::map::IndexedValue;
874 ///
875 /// let map1 = Map::from_iter(vec![
876 /// ("a", 1), ("b", 2), ("c", 3),
877 /// ]).unwrap();
878 /// let map2 = Map::from_iter(vec![
879 /// ("a", 11), ("y", 12), ("z", 13),
880 /// ]).unwrap();
881 ///
882 /// let mut difference = map1.op().add(&map2).difference();
883 ///
884 /// let mut kvs = vec![];
885 /// while let Some((k, vs)) = difference.next() {
886 /// kvs.push((k.to_vec(), vs.to_vec()));
887 /// }
888 /// assert_eq!(kvs, vec![
889 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
890 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
891 /// ]);
892 /// ```
893 #[inline]
894 pub fn difference(self) -> Difference<'m> {
895 Difference(self.0.difference())
896 }
897
898 /// Performs a symmetric difference operation on all of the streams that
899 /// have been added.
900 ///
901 /// When there are only two streams, then the keys returned correspond to
902 /// keys that are in either stream but *not* in both streams.
903 ///
904 /// More generally, for any number of streams, keys that occur in an odd
905 /// number of streams are returned.
906 ///
907 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
908 /// first element of the tuple is the byte string key. The second element
909 /// of the tuple is a list of all occurrences of that key in participating
910 /// streams. The `IndexedValue` contains an index and the value associated
911 /// with that key in that stream. The index uniquely identifies each
912 /// stream, which is an integer that is auto-incremented when a stream
913 /// is added to this operation (starting at `0`).
914 ///
915 /// # Example
916 ///
917 /// ```rust
918 /// use lindera_fst::{IntoStreamer, Streamer, Map};
919 /// use lindera_fst::map::IndexedValue;
920 ///
921 /// let map1 = Map::from_iter(vec![
922 /// ("a", 1), ("b", 2), ("c", 3),
923 /// ]).unwrap();
924 /// let map2 = Map::from_iter(vec![
925 /// ("a", 11), ("y", 12), ("z", 13),
926 /// ]).unwrap();
927 ///
928 /// let mut sym_difference = map1.op().add(&map2).symmetric_difference();
929 ///
930 /// let mut kvs = vec![];
931 /// while let Some((k, vs)) = sym_difference.next() {
932 /// kvs.push((k.to_vec(), vs.to_vec()));
933 /// }
934 /// assert_eq!(kvs, vec![
935 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
936 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
937 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
938 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
939 /// ]);
940 /// ```
941 #[inline]
942 pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
943 SymmetricDifference(self.0.symmetric_difference())
944 }
945}
946
947impl<'f, I, S> Extend<I> for OpBuilder<'f>
948where
949 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
950 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
951{
952 fn extend<T>(&mut self, it: T)
953 where
954 T: IntoIterator<Item = I>,
955 {
956 for stream in it {
957 self.push(stream);
958 }
959 }
960}
961
962impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
963where
964 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
965 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
966{
967 fn from_iter<T>(it: T) -> Self
968 where
969 T: IntoIterator<Item = I>,
970 {
971 let mut op = OpBuilder::new();
972 op.extend(it);
973 op
974 }
975}
976
977/// A stream of set union over multiple map streams in lexicographic order.
978///
979/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
980pub struct Union<'m>(raw::Union<'m>);
981
982impl<'a, 'm> Streamer<'a> for Union<'m> {
983 type Item = (&'a [u8], &'a [IndexedValue]);
984
985 #[inline]
986 fn next(&'a mut self) -> Option<Self::Item> {
987 self.0.next()
988 }
989}
990
991/// A stream of set intersection over multiple map streams in lexicographic
992/// order.
993///
994/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
995pub struct Intersection<'m>(raw::Intersection<'m>);
996
997impl<'a, 'm> Streamer<'a> for Intersection<'m> {
998 type Item = (&'a [u8], &'a [IndexedValue]);
999
1000 #[inline]
1001 fn next(&'a mut self) -> Option<Self::Item> {
1002 self.0.next()
1003 }
1004}
1005
1006/// A stream of set difference over multiple map streams in lexicographic
1007/// order.
1008///
1009/// The difference operation is taken with respect to the first stream and the
1010/// rest of the streams. i.e., All elements in the first stream that do not
1011/// appear in any other streams.
1012///
1013/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1014pub struct Difference<'m>(raw::Difference<'m>);
1015
1016impl<'a, 'm> Streamer<'a> for Difference<'m> {
1017 type Item = (&'a [u8], &'a [IndexedValue]);
1018
1019 #[inline]
1020 fn next(&'a mut self) -> Option<Self::Item> {
1021 self.0.next()
1022 }
1023}
1024
1025/// A stream of set symmetric difference over multiple map streams in
1026/// lexicographic order.
1027///
1028/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1029pub struct SymmetricDifference<'m>(raw::SymmetricDifference<'m>);
1030
1031impl<'a, 'm> Streamer<'a> for SymmetricDifference<'m> {
1032 type Item = (&'a [u8], &'a [IndexedValue]);
1033
1034 #[inline]
1035 fn next(&'a mut self) -> Option<Self::Item> {
1036 self.0.next()
1037 }
1038}
1039
1040/// A specialized stream for mapping map streams (`(&[u8], u64)`) to streams
1041/// used by raw fsts (`(&[u8], Output)`).
1042///
1043/// If this were iterators, we could use `iter::Map`, but doing this on streams
1044/// requires HKT, so we need to write out the monomorphization ourselves.
1045struct StreamOutput<S>(S);
1046
1047impl<'a, S> Streamer<'a> for StreamOutput<S>
1048where
1049 S: Streamer<'a, Item = (&'a [u8], u64)>,
1050{
1051 type Item = (&'a [u8], raw::Output);
1052
1053 fn next(&'a mut self) -> Option<Self::Item> {
1054 self.0.next().map(|(k, v)| (k, raw::Output::new(v)))
1055 }
1056}
1057
1058/// A lexicographically ordered stream of key-value from a map
1059/// along with the states of the automaton.
1060///
1061/// The `Stream` type is based on the `StreamWithState`.
1062pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
1063where
1064 A: Automaton;
1065
1066impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
1067where
1068 A::State: Clone,
1069{
1070 type Item = (&'a [u8], u64, A::State);
1071
1072 fn next(&'a mut self) -> Option<Self::Item> {
1073 self.0
1074 .next()
1075 .map(|(key, out, state)| (key, out.value(), state))
1076 }
1077}