lindera_fst/raw/mod.rs
1/*!
2Operations on raw finite state transducers.
3
4This sub-module exposes the guts of a finite state transducer. Many parts of
5it, such as construction and traversal, are mirrored in the `set` and `map`
6sub-modules. Other parts of it, such as direct access to nodes and transitions
7in the transducer, do not have any analog.
8
9# Overview of types
10
11`Fst` is a read only interface to pre-constructed finite state transducers.
12`Node` is a read only interface to a single node in a transducer. `Builder` is
13used to create new finite state transducers. (Once a transducer is created, it
14can never be modified.) `Stream` is a stream of all inputs and outputs in a
15transducer. `StreamBuilder` builds range queries. `OpBuilder` collects streams
16and executes set operations like `union` or `intersection` on them with the
17option of specifying a merge strategy for output values.
18
19Most of the rest of the types are streams from set operations.
20*/
21use std::fmt;
22use std::ops::Deref;
23use std::{cmp, mem};
24
25use byteorder::{LittleEndian, ReadBytesExt};
26
27use crate::automaton::{AlwaysMatch, Automaton};
28use crate::error::Result;
29use crate::stream::{IntoStreamer, Streamer};
30
31pub use self::build::Builder;
32pub use self::error::Error;
33use self::node::node_new;
34pub use self::node::{Node, Transitions};
35pub use self::ops::{
36 Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference, Union,
37};
38
39mod build;
40mod common_inputs;
41mod counting_writer;
42mod error;
43mod node;
44mod ops;
45mod pack;
46mod registry;
47mod registry_minimal;
48#[cfg(test)]
49mod tests;
50
51/// The API version of this crate.
52///
53/// This version number is written to every finite state transducer created by
54/// this crate. When a finite state transducer is read, its version number is
55/// checked against this value.
56///
57/// Currently, any version mismatch results in an error. Fixing this requires
58/// regenerating the finite state transducer or switching to a version of this
59/// crate that is compatible with the serialized transducer. This particular
60/// behavior may be relaxed in future versions.
61pub const VERSION: u64 = 2;
62
63/// A sentinel value used to indicate an empty final state.
64const EMPTY_ADDRESS: CompiledAddr = 0;
65
66/// A sentinel value used to indicate an invalid state.
67///
68/// This is never the address of a node in a serialized transducer.
69const NONE_ADDRESS: CompiledAddr = 1;
70
71/// Default capacity for the key buffer of a stream.
72const KEY_BUFFER_CAPACITY: usize = 128;
73
74/// FstType is a convention used to indicate the type of the underlying
75/// transducer.
76///
77/// This crate reserves the range 0-255 (inclusive) but currently leaves the
78/// meaning of 0-255 unspecified.
79pub type FstType = u64;
80
81/// CompiledAddr is the type used to address nodes in a finite state
82/// transducer.
83///
84/// It is most useful as a pointer to nodes. It can be used in the `Fst::node`
85/// method to resolve the pointer.
86pub type CompiledAddr = usize;
87
88/// An acyclic deterministic finite state transducer.
89///
90/// # How does it work?
91///
92/// The short answer: it's just like a prefix trie, which compresses keys
93/// based only on their prefixes, except that a automaton/transducer also
94/// compresses suffixes.
95///
96/// The longer answer is that keys in an automaton are stored only in the
97/// transitions from one state to another. A key can be acquired by tracing
98/// a path from the root of the automaton to any match state. The inputs along
99/// each transition are concatenated. Once a match state is reached, the
100/// concatenation of inputs up until that point corresponds to a single key.
101///
102/// But why is it called a transducer instead of an automaton? A finite state
103/// transducer is just like a finite state automaton, except that it has output
104/// transitions in addition to input transitions. Namely, the value associated
105/// with any particular key is determined by summing the outputs along every
106/// input transition that leads to the key's corresponding match state.
107///
108/// This is best demonstrated with a couple images. First, let's ignore the
109/// "transducer" aspect and focus on a plain automaton.
110///
111/// Consider that your keys are abbreviations of some of the months in the
112/// Gregorian calendar:
113///
114/// ```ignore
115/// jan
116/// feb
117/// mar
118/// apr
119/// may
120/// jun
121/// jul
122/// ```
123///
124/// The corresponding automaton that stores all of these as keys looks like
125/// this:
126///
127/// 
128///
129/// Notice here how the prefix and suffix of `jan` and `jun` are shared.
130/// Similarly, the prefixes of `jun` and `jul` are shared and the prefixes
131/// of `mar` and `may` are shared.
132///
133/// All of the keys from this automaton can be enumerated in lexicographic
134/// order by following every transition from each node in lexicographic
135/// order. Since it is acyclic, the procedure will terminate.
136///
137/// A key can be found by tracing it through the transitions in the automaton.
138/// For example, the key `aug` is known not to be in the automaton by only
139/// visiting the root state (because there is no `a` transition). For another
140/// example, the key `jax` is known not to be in the set only after moving
141/// through the transitions for `j` and `a`. Namely, after those transitions
142/// are followed, there are no transitions for `x`.
143///
144/// Notice here that looking up a key is proportional the length of the key
145/// itself. Namely, lookup time is not affected by the number of keys in the
146/// automaton!
147///
148/// Additionally, notice that the automaton exploits the fact that many keys
149/// share common prefixes and suffixes. For example, `jun` and `jul` are
150/// represented with no more states than would be required to represent either
151/// one on its own. Instead, the only change is a single extra transition. This
152/// is a form of compression and is key to how the automatons produced by this
153/// crate are so small.
154///
155/// Let's move on to finite state transducers. Consider the same set of keys
156/// as above, but let's assign their numeric month values:
157///
158/// ```ignore
159/// jan,1
160/// feb,2
161/// mar,3
162/// apr,4
163/// may,5
164/// jun,6
165/// jul,7
166/// ```
167///
168/// The corresponding transducer looks very similar to the automaton above,
169/// except outputs have been added to some of the transitions:
170///
171/// 
172///
173/// All of the operations with a transducer are the same as described above
174/// for automatons. Additionally, the same compression techniques are used:
175/// common prefixes and suffixes in keys are exploited.
176///
177/// The key difference is that some transitions have been given an output.
178/// As one follows input transitions, one must sum the outputs as they
179/// are seen. (A transition with no output represents the additive identity,
180/// or `0` in this case.) For example, when looking up `feb`, the transition
181/// `f` has output `2`, the transition `e` has output `0`, and the transition
182/// `b` also has output `0`. The sum of these is `2`, which is exactly the
183/// value we associated with `feb`.
184///
185/// For another more interesting example, consider `jul`. The `j` transition
186/// has output `1`, the `u` transition has output `5` and the `l` transition
187/// has output `1`. Summing these together gets us `7`, which is again the
188/// correct value associated with `jul`. Notice that if we instead looked up
189/// the `jun` key, then the `n` transition would be followed instead of the
190/// `l` transition, which has no output. Therefore, the `jun` key equals
191/// `1+5+0=6`.
192///
193/// The trick to transducers is that there exists a unique path through the
194/// transducer for every key, and its outputs are stored appropriately along
195/// this path such that the correct value is returned when they are all summed
196/// together. This process also enables the data that makes up each value to be
197/// shared across many values in the transducer in exactly the same way that
198/// keys are shared. This is yet another form of compression!
199///
200/// # Bonus: a billion strings
201///
202/// The amount of compression one can get from automata can be absolutely
203/// ridiuclous. Consider the particular case of storing all billion strings
204/// in the range `0000000001-1000000000`, e.g.,
205///
206/// ```ignore
207/// 0000000001
208/// 0000000002
209/// ...
210/// 0000000100
211/// 0000000101
212/// ...
213/// 0999999999
214/// 1000000000
215/// ```
216///
217/// The corresponding automaton looks like this:
218///
219/// ![finite state automaton - one billion strings]
220/// (http://burntsushi.net/stuff/one-billion.png)
221///
222/// Indeed, the on disk size of this automaton is a mere **251 bytes**.
223///
224/// Of course, this is a bit of a pathological best case, but it does serve
225/// to show how good compression can be in the optimal case.
226///
227/// Also, check out the
228/// [corresponding transducer](http://burntsushi.net/stuff/one-billion-map.svg)
229/// that maps each string to its integer value. It's a bit bigger, but still
230/// only takes up **896 bytes** of space on disk. This demonstrates that
231/// output values are also compressible.
232///
233/// # Does this crate produce minimal transducers?
234///
235/// For any non-trivial sized set of keys, it is unlikely that this crate will
236/// produce a minimal transducer. As far as this author knows, guaranteeing a
237/// minimal transducer requires working memory proportional to the number of
238/// states. This can be quite costly and is anathema to the main design goal of
239/// this crate: provide the ability to work with gigantic sets of strings with
240/// constant memory overhead.
241///
242/// Instead, construction of a finite state transducer uses a cache of
243/// states. More frequently used states are cached and reused, which provides
244/// reasonably good compression ratios. (No comprehensive benchmarks exist to
245/// back up this claim.)
246///
247/// It is possible that this crate may expose a way to guarantee minimal
248/// construction of transducers at the expense of exorbitant memory
249/// requirements.
250///
251/// # Bibliography
252///
253/// I initially got the idea to use finite state tranducers to represent
254/// ordered sets/maps from
255/// [Michael
256/// McCandless'](http://blog.mikemccandless.com/2010/12/using-finite-state-transducers-in.html)
257/// work on incorporating transducers in Lucene.
258///
259/// However, my work would also not have been possible without the hard work
260/// of many academics, especially
261/// [Jan Daciuk](http://galaxy.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/).
262///
263/// * [Incremental construction of minimal acyclic finite-state automata](http://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601)
264/// (Section 3 provides a decent overview of the algorithm used to construct
265/// transducers in this crate, assuming all outputs are `0`.)
266/// * [Direct Construction of Minimal Acyclic Subsequential Transducers](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.3698&rep=rep1&type=pdf)
267/// (The whole thing. The proof is dense but illuminating. The algorithm at
268/// the end is the money shot, namely, it incorporates output values.)
269/// * [Experiments with Automata Compression](http://www.researchgate.net/profile/Jii_Dvorsky/publication/221568039_Word_Random_Access_Compression/links/0c96052c095630d5b3000000.pdf#page=116), [Smaller Representation of Finite State Automata](http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf)
270/// (various compression techniques for representing states/transitions)
271/// * [Jan Daciuk's dissertation](http://www.pg.gda.pl/~jandac/thesis.ps.gz)
272/// (excellent for in depth overview)
273/// * [Comparison of Construction Algorithms for Minimal, Acyclic, Deterministic, Finite-State Automata from Sets of Strings](http://www.cs.mun.ca/~harold/Courses/Old/CS4750/Diary/q3p2qx4lv71m5vew.pdf)
274/// (excellent for surface level overview)
275#[derive(Clone)]
276pub struct Fst<Data = Vec<u8>> {
277 meta: FstMeta,
278 data: Data,
279}
280
281#[derive(Clone)]
282struct FstMeta {
283 version: u64,
284 root_addr: CompiledAddr,
285 ty: FstType,
286 len: usize,
287}
288
289impl FstMeta {
290 #[inline(always)]
291 fn root<'f>(&self, data: &'f [u8]) -> Node<'f> {
292 self.node(self.root_addr, data)
293 }
294
295 #[inline(always)]
296 fn node<'f>(&self, addr: CompiledAddr, data: &'f [u8]) -> Node<'f> {
297 node_new(self.version, addr, data)
298 }
299
300 fn empty_final_output(&self, data: &[u8]) -> Option<Output> {
301 let root = self.root(data);
302 if root.is_final() {
303 Some(root.final_output())
304 } else {
305 None
306 }
307 }
308}
309
310impl<Data: Deref<Target = [u8]>> Fst<Data> {
311 /// Open a `Fst` from a given data.
312 pub fn new(data: Data) -> Result<Fst<Data>> {
313 if data.len() < 32 {
314 return Err(Error::Format.into());
315 }
316 // The read_u64 unwraps below are OK because they can never fail.
317 // They can only fail when there is an IO error or if there is an
318 // unexpected EOF. However, we are reading from a byte slice (no
319 // IO errors possible) and we've confirmed the byte slice is at least
320 // N bytes (no unexpected EOF).
321 let version = (&*data).read_u64::<LittleEndian>().unwrap();
322 if version == 0 || version > VERSION {
323 return Err(Error::Version {
324 expected: VERSION,
325 got: version,
326 }
327 .into());
328 }
329 let ty = (&data[8..]).read_u64::<LittleEndian>().unwrap();
330 let root_addr = {
331 let mut last = &data[data.len() - 8..];
332 u64_to_usize(last.read_u64::<LittleEndian>().unwrap())
333 };
334 let len = {
335 let mut last2 = &data[data.len() - 16..];
336 u64_to_usize(last2.read_u64::<LittleEndian>().unwrap())
337 };
338 // The root node is always the last node written, so its address should
339 // be near the end. After the root node is written, we still have to
340 // write the root *address* and the number of keys in the FST.
341 // That's 16 bytes. The extra byte comes from the fact that the root
342 // address points to the last byte in the root node, rather than the
343 // byte immediately following the root node.
344 //
345 // If this check passes, it is still possible that the FST is invalid
346 // but probably unlikely. If this check reports a false positive, then
347 // the program will probably panic. In the worst case, the FST will
348 // operate but be subtly wrong. (This would require the bytes to be in
349 // a format expected by an FST, which is incredibly unlikely.)
350 //
351 // The special check for EMPTY_ADDRESS is needed since an empty FST
352 // has a root node that is empty and final, which means it has the
353 // special address `0`. In that case, the FST is the smallest it can
354 // be: the version, type, root address and number of nodes. That's
355 // 32 bytes (8 byte u64 each).
356 //
357 // This is essentially our own little checksum.
358 if (root_addr == EMPTY_ADDRESS && data.len() != 32) && root_addr + 17 != data.len() {
359 return Err(Error::Format.into());
360 }
361 Ok(Fst {
362 data,
363 meta: FstMeta {
364 version,
365 root_addr,
366 ty,
367 len,
368 },
369 })
370 }
371
372 /// Retrieves the value associated with a key.
373 ///
374 /// If the key does not exist, then `None` is returned.
375 #[inline(never)]
376 pub fn get<B: AsRef<[u8]>>(&self, key: B) -> Option<Output> {
377 let mut node = self.root();
378 let mut out = Output::zero();
379 for &b in key.as_ref() {
380 node = match node.find_input(b) {
381 None => return None,
382 Some(i) => {
383 let t = node.transition(i);
384 out = out.cat(t.out);
385 self.node(t.addr)
386 }
387 }
388 }
389 if !node.is_final() {
390 None
391 } else {
392 Some(out.cat(node.final_output()))
393 }
394 }
395
396 /// Returns true if and only if the given key is in this FST.
397 pub fn contains_key<B: AsRef<[u8]>>(&self, key: B) -> bool {
398 let mut node = self.root();
399 for &b in key.as_ref() {
400 node = match node.find_input(b) {
401 None => return false,
402 Some(i) => self.node(node.transition_addr(i)),
403 }
404 }
405 node.is_final()
406 }
407
408 /// Return a lexicographically ordered stream of all key-value pairs in
409 /// this fst.
410 #[inline]
411 pub fn stream(&self) -> Stream {
412 self.stream_builder(AlwaysMatch).into_stream()
413 }
414
415 fn stream_builder<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
416 StreamBuilder::new(&self.meta, &self.data, aut)
417 }
418
419 /// Return a builder for range queries.
420 ///
421 /// A range query returns a subset of key-value pairs in this fst in a
422 /// range given in lexicographic order.
423 #[inline]
424 pub fn range(&self) -> StreamBuilder {
425 self.stream_builder(AlwaysMatch)
426 }
427
428 /// Executes an automaton on the keys of this map.
429 pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
430 self.stream_builder(aut)
431 }
432
433 /// Returns the number of keys in this fst.
434 #[inline]
435 pub fn len(&self) -> usize {
436 self.meta.len
437 }
438
439 /// Returns true if and only if this fst has no keys.
440 #[inline]
441 pub fn is_empty(&self) -> bool {
442 self.len() == 0
443 }
444
445 /// Returns the number of bytes used by this fst.
446 #[inline]
447 pub fn size(&self) -> usize {
448 self.data.len()
449 }
450
451 /// Creates a new fst operation with this fst added to it.
452 ///
453 /// The `OpBuilder` type can be used to add additional fst streams
454 /// and perform set operations like union, intersection, difference and
455 /// symmetric difference on the keys of the fst. These set operations also
456 /// allow one to specify how conflicting values are merged in the stream.
457 #[inline]
458 pub fn op(&self) -> OpBuilder {
459 OpBuilder::default().add(self)
460 }
461
462 /// Returns true if and only if the `self` fst is disjoint with the fst
463 /// `stream`.
464 ///
465 /// `stream` must be a lexicographically ordered sequence of byte strings
466 /// with associated values.
467 pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
468 where
469 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
470 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
471 {
472 self.op().add(stream).intersection().next().is_none()
473 }
474
475 /// Returns true if and only if the `self` fst is a subset of the fst
476 /// `stream`.
477 ///
478 /// `stream` must be a lexicographically ordered sequence of byte strings
479 /// with associated values.
480 pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
481 where
482 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
483 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
484 {
485 let mut op = self.op().add(stream).intersection();
486 let mut count = 0;
487 while let Some(_) = op.next() {
488 count += 1;
489 }
490 count == self.len()
491 }
492
493 /// Returns true if and only if the `self` fst is a superset of the fst
494 /// `stream`.
495 ///
496 /// `stream` must be a lexicographically ordered sequence of byte strings
497 /// with associated values.
498 pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
499 where
500 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
501 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
502 {
503 let mut op = self.op().add(stream).union();
504 let mut count = 0;
505 while let Some(_) = op.next() {
506 count += 1;
507 }
508 count == self.len()
509 }
510
511 /// Returns the underlying type of this fst.
512 ///
513 /// FstType is a convention used to indicate the type of the underlying
514 /// transducer.
515 ///
516 /// This crate reserves the range 0-255 (inclusive) but currently leaves
517 /// the meaning of 0-255 unspecified.
518 #[inline]
519 pub fn fst_type(&self) -> FstType {
520 self.meta.ty
521 }
522
523 /// Returns the root node of this fst.
524 #[inline(always)]
525 pub fn root(&self) -> Node {
526 self.meta.root(self.data.deref())
527 }
528
529 /// Returns the node at the given address.
530 ///
531 /// Node addresses can be obtained by reading transitions on `Node` values.
532 #[inline]
533 pub fn node(&self, addr: CompiledAddr) -> Node {
534 self.meta.node(addr, self.data.deref())
535 }
536
537 /// Returns a copy of the binary contents of this FST.
538 #[inline]
539 pub fn to_vec(&self) -> Vec<u8> {
540 self.data.to_vec()
541 }
542}
543
544impl<'a, 'f, Data> IntoStreamer<'a> for &'f Fst<Data>
545where
546 Data: Deref<Target = [u8]>,
547{
548 type Item = (&'a [u8], Output);
549 type Into = Stream<'f>;
550
551 #[inline]
552 fn into_stream(self) -> Self::Into {
553 self.stream()
554 }
555}
556
557/// A builder for constructing range queries on streams.
558///
559/// Once all bounds are set, one should call `into_stream` to get a
560/// `Stream`.
561///
562/// Bounds are not additive. That is, if `ge` is called twice on the same
563/// builder, then the second setting wins.
564///
565/// The `A` type parameter corresponds to an optional automaton to filter
566/// the stream. By default, no filtering is done.
567///
568/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
569pub struct StreamBuilder<'f, A = AlwaysMatch> {
570 meta: &'f FstMeta,
571 data: &'f [u8],
572 aut: A,
573 min: Bound,
574 max: Bound,
575 backward: bool,
576}
577
578impl<'f, A: Automaton> StreamBuilder<'f, A> {
579 fn new(meta: &'f FstMeta, data: &'f [u8], aut: A) -> Self {
580 StreamBuilder {
581 meta,
582 data,
583 aut,
584 min: Bound::Unbounded,
585 max: Bound::Unbounded,
586 backward: false,
587 }
588 }
589
590 /// Specify a greater-than-or-equal-to bound.
591 pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
592 self.min = Bound::Included(bound.as_ref().to_owned());
593 self
594 }
595
596 /// Specify a greater-than bound.
597 pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
598 self.min = Bound::Excluded(bound.as_ref().to_owned());
599 self
600 }
601
602 /// Specify a less-than-or-equal-to bound.
603 pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
604 self.max = Bound::Included(bound.as_ref().to_owned());
605 self
606 }
607
608 /// Specify a less-than bound.
609 pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
610 self.max = Bound::Excluded(bound.as_ref().to_owned());
611 self
612 }
613
614 /// Sets the `StreamBuilder` to stream the `(key, value)` backward.
615 pub fn backward(mut self) -> Self {
616 self.backward = true;
617 self
618 }
619
620 /// Return this builder and gives the automaton states
621 /// along with the results.
622 pub fn with_state(self) -> StreamWithStateBuilder<'f, A> {
623 StreamWithStateBuilder(self)
624 }
625}
626
627impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> {
628 type Item = (&'a [u8], Output);
629 type Into = Stream<'f, A>;
630
631 fn into_stream(self) -> Stream<'f, A> {
632 Stream::new(
633 self.meta,
634 self.data,
635 self.aut,
636 self.min,
637 self.max,
638 self.backward,
639 )
640 }
641}
642
643/// A builder for constructing range queries of streams
644/// that returns results along with automaton states.
645///
646/// Once all bounds are set, one should call `into_stream` to get a
647/// `StreamWithState`.
648///
649/// Bounds are not additive. That is, if `ge` is called twice on the same
650/// builder, then the second setting wins.
651///
652/// The `A` type parameter corresponds to an optional automaton to filter
653/// the stream. By default, no filtering is done.
654///
655/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
656pub struct StreamWithStateBuilder<'f, A = AlwaysMatch>(StreamBuilder<'f, A>);
657
658impl<'a, 'f, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'f, A>
659where
660 A::State: Clone,
661{
662 type Item = (&'a [u8], Output, A::State);
663 type Into = StreamWithState<'f, A>;
664
665 fn into_stream(self) -> StreamWithState<'f, A> {
666 StreamWithState::new(
667 self.0.meta,
668 self.0.data,
669 self.0.aut,
670 self.0.min,
671 self.0.max,
672 self.0.backward,
673 )
674 }
675}
676
677#[derive(Clone, Debug)]
678enum Bound {
679 Included(Vec<u8>),
680 Excluded(Vec<u8>),
681 Unbounded,
682}
683
684impl Bound {
685 fn exceeded_by(&self, inp: &[u8]) -> bool {
686 match *self {
687 Bound::Included(ref v) => inp > v,
688 Bound::Excluded(ref v) => inp >= v,
689 Bound::Unbounded => false,
690 }
691 }
692
693 fn subceeded_by(&self, inp: &[u8]) -> bool {
694 match *self {
695 Bound::Included(ref v) => inp < v,
696 Bound::Excluded(ref v) => inp <= v,
697 Bound::Unbounded => false,
698 }
699 }
700
701 fn is_empty(&self) -> bool {
702 match *self {
703 Bound::Included(ref v) => v.is_empty(),
704 Bound::Excluded(ref v) => v.is_empty(),
705 Bound::Unbounded => true,
706 }
707 }
708
709 fn is_inclusive(&self) -> bool {
710 match *self {
711 Bound::Excluded(_) => false,
712 _ => true,
713 }
714 }
715}
716
717/// Stream of `key, value` not exposing the state of the automaton.
718pub struct Stream<'f, A = AlwaysMatch>(StreamWithState<'f, A>)
719where
720 A: Automaton;
721
722impl<'f, A: Automaton> Stream<'f, A> {
723 fn new(
724 meta: &'f FstMeta,
725 data: &'f [u8],
726 aut: A,
727 min: Bound,
728 max: Bound,
729 backward: bool,
730 ) -> Self {
731 Self(StreamWithState::new(meta, data, aut, min, max, backward))
732 }
733
734 /// Convert this stream into a vector of byte strings and outputs.
735 ///
736 /// Note that this creates a new allocation for every key in the stream.
737 pub fn into_byte_vec(mut self) -> Vec<(Vec<u8>, u64)> {
738 let mut vs = vec![];
739 while let Some((k, v)) = self.next() {
740 vs.push((k.to_vec(), v.value()));
741 }
742 vs
743 }
744
745 /// Convert this stream into a vector of Unicode strings and outputs.
746 ///
747 /// If any key is not valid UTF-8, then iteration on the stream is stopped
748 /// and a UTF-8 decoding error is returned.
749 ///
750 /// Note that this creates a new allocation for every key in the stream.
751 pub fn into_str_vec(mut self) -> Result<Vec<(String, u64)>> {
752 let mut vs = vec![];
753 while let Some((k, v)) = self.next() {
754 let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
755 vs.push((k, v.value()));
756 }
757 Ok(vs)
758 }
759
760 /// Convert this stream into a vector of byte strings.
761 ///
762 /// Note that this creates a new allocation for every key in the stream.
763 pub fn into_byte_keys(mut self) -> Vec<Vec<u8>> {
764 let mut vs = vec![];
765 while let Some((k, _)) = self.next() {
766 vs.push(k.to_vec());
767 }
768 vs
769 }
770
771 /// Convert this stream into a vector of Unicode strings.
772 ///
773 /// If any key is not valid UTF-8, then iteration on the stream is stopped
774 /// and a UTF-8 decoding error is returned.
775 ///
776 /// Note that this creates a new allocation for every key in the stream.
777 pub fn into_str_keys(mut self) -> Result<Vec<String>> {
778 let mut vs = vec![];
779 while let Some((k, _)) = self.next() {
780 let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
781 vs.push(k);
782 }
783 Ok(vs)
784 }
785
786 /// Convert this stream into a vector of outputs.
787 pub fn into_values(mut self) -> Vec<u64> {
788 let mut vs = vec![];
789 while let Some((_, v)) = self.next() {
790 vs.push(v.value());
791 }
792 vs
793 }
794}
795
796impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> {
797 type Item = (&'a [u8], Output);
798
799 fn next(&'a mut self) -> Option<Self::Item> {
800 self.0.next(|_| ()).map(|(key, out, _)| (key, out))
801 }
802}
803
804/// A lexicographically ordered stream from an fst
805/// of key-value pairs along with the state of the automaton.
806///
807/// The `A` type parameter corresponds to an optional automaton to filter
808/// the stream. By default, no filtering is done.
809///
810/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
811#[derive(Clone)]
812pub struct StreamWithState<'f, A = AlwaysMatch>
813where
814 A: Automaton,
815{
816 fst: &'f FstMeta,
817 data: &'f [u8],
818 aut: A,
819 inp: Buffer,
820 empty_output: Option<Output>,
821 stack: Vec<StreamState<'f, A::State>>,
822 end_at: Bound,
823 min: Bound,
824 max: Bound,
825 reversed: bool,
826}
827
828#[derive(Clone, Debug)]
829struct StreamState<'f, S> {
830 node: Node<'f>,
831 trans: usize,
832 out: Output,
833 aut_state: S,
834 done: bool, // ('done' = true) means that there are no unexplored transitions in the current state.
835 // 'trans' value should be ignored when done is true.
836}
837
838impl<'f, A: Automaton> StreamWithState<'f, A> {
839 fn new(
840 fst: &'f FstMeta,
841 data: &'f [u8],
842 aut: A,
843 min: Bound,
844 max: Bound,
845 backward: bool,
846 ) -> Self {
847 let min_2 = min.clone();
848 let max_2 = max.clone();
849 let end_at: Bound = if !backward { max.clone() } else { min.clone() };
850 let mut stream = StreamWithState {
851 fst,
852 data,
853 aut,
854 inp: Buffer::new(),
855 empty_output: None,
856 stack: vec![],
857 end_at,
858 min: min_2,
859 max: max_2,
860 reversed: backward,
861 };
862 stream.seek(&min, &max);
863 stream
864 }
865
866 /// Seeks the underlying stream such that the next key to be read is the
867 /// smallest key in the underlying fst that satisfies the given minimum
868 /// bound.
869 ///
870 /// This theoretically should be straight-forward, but we need to make
871 /// sure our stack is correct, which includes accounting for automaton
872 /// states.
873 fn seek(&mut self, min: &Bound, max: &Bound) {
874 let start_bound = if self.reversed { &max } else { &min };
875 if min.is_empty() && min.is_inclusive() {
876 self.empty_output = self.resolve_empty_output(min, max);
877 }
878 if start_bound.is_empty() {
879 self.stack.clear();
880 let node = self.fst.root(self.data);
881 let transition = self.starting_transition(&node);
882 self.stack = vec![StreamState {
883 node,
884 trans: transition.unwrap_or_default(),
885 out: Output::zero(),
886 aut_state: self.aut.start(),
887 done: transition.is_none(),
888 }];
889 return;
890 }
891 let (key, inclusive) = match start_bound {
892 Bound::Excluded(ref start_bound) => (start_bound, false),
893 Bound::Included(ref start_bound) => (start_bound, true),
894 Bound::Unbounded => unreachable!(),
895 };
896 // At this point, we need to find the starting location of `min` in
897 // the FST. However, as we search, we need to maintain a stack of
898 // reader states so that the reader can pick up where we left off.
899 // N.B. We do not necessarily need to stop in a final state, unlike
900 // the one-off `find` method. For the example, the given bound might
901 // not actually exist in the FST.
902 let mut node = self.fst.root(self.data);
903 let mut out = Output::zero();
904 let mut aut_state = self.aut.start();
905 for &b in key {
906 match node.find_input(b) {
907 Some(i) => {
908 let t = node.transition(i);
909 let prev_state = aut_state;
910 aut_state = self.aut.accept(&prev_state, b);
911 self.inp.push(b);
912 let transition = self.next_transition(&node, i);
913 self.stack.push(StreamState {
914 node,
915 trans: transition.unwrap_or_default(),
916 out,
917 aut_state: prev_state,
918 done: transition.is_none(),
919 });
920 out = out.cat(t.out);
921 node = self.fst.node(t.addr, self.data);
922 }
923 None => {
924 // This is a little tricky. We're in this case if the
925 // given bound is not a prefix of any key in the FST.
926 // Since this is a minimum bound, we need to find the
927 // first transition in this node that proceeds the current
928 // input byte.
929 let trans = self.transition_within_bound(&node, b);
930 self.stack.push(StreamState {
931 node,
932 trans: trans.unwrap_or_default(),
933 out,
934 aut_state,
935 done: trans.is_none(),
936 });
937 return;
938 }
939 }
940 }
941 if self.stack.is_empty() {
942 return;
943 }
944 let last = self.stack.len() - 1;
945 let state = &self.stack[last];
946 let transition = if !state.done {
947 self.previous_transition(&state.node, state.trans)
948 } else {
949 self.last_transition(&state.node)
950 };
951 if inclusive {
952 self.stack[last].trans = transition.unwrap_or_default();
953 self.stack[last].done = transition.is_none();
954 self.inp.pop();
955 } else {
956 let next_node = self.fst.node(
957 state.node.transition(transition.unwrap_or_default()).addr,
958 self.data,
959 );
960 let starting_transition = self.starting_transition(&next_node);
961 self.stack.push(StreamState {
962 node: next_node,
963 trans: starting_transition.unwrap_or_default(),
964 out,
965 aut_state,
966 done: starting_transition.is_none(),
967 });
968 }
969 }
970
971 #[inline]
972 fn next<F, T>(&mut self, transform: F) -> Option<(&[u8], Output, T)>
973 where
974 F: Fn(&A::State) -> T,
975 {
976 if !self.reversed {
977 // Inorder empty output (will be first).
978 if let Some(out) = self.empty_output.take() {
979 return Some((&[], out, transform(&self.aut.start())));
980 }
981 }
982 while let Some(state) = self.stack.pop() {
983 if state.done || !self.aut.can_match(&state.aut_state) {
984 if state.node.addr() != self.fst.root_addr {
985 // Reversed return next logic.
986 // If the stack is empty the value should not be returned.
987 if self.reversed && !self.stack.is_empty() && state.node.is_final() {
988 let out_of_bounds =
989 self.min.subceeded_by(&self.inp) || self.max.exceeded_by(&self.inp);
990 if !out_of_bounds && self.aut.is_match(&state.aut_state) {
991 return Some((&self.inp.pop(), state.out, transform(&state.aut_state)));
992 }
993 }
994 self.inp.pop();
995 }
996 continue;
997 }
998 let trans = state.node.transition(state.trans);
999 let out = state.out.cat(trans.out);
1000 let next_state = self.aut.accept(&state.aut_state, trans.inp);
1001 let is_match = self.aut.is_match(&next_state);
1002 let next_node = self.fst.node(trans.addr, self.data);
1003 self.inp.push(trans.inp);
1004 let current_transition = self.next_transition(&state.node, state.trans);
1005 self.stack.push(StreamState {
1006 trans: current_transition.unwrap_or_default(),
1007 done: current_transition.is_none(),
1008 ..state
1009 });
1010 let ns = transform(&next_state);
1011 let next_transition = self.starting_transition(&next_node);
1012 self.stack.push(StreamState {
1013 node: next_node,
1014 trans: next_transition.unwrap_or_default(),
1015 out,
1016 aut_state: next_state,
1017 done: next_transition.is_none(),
1018 });
1019 // Inorder return next logic.
1020 if !self.reversed {
1021 if self.end_at.exceeded_by(&self.inp) {
1022 // We are done, forever.
1023 self.stack.clear();
1024 return None;
1025 } else if !self.reversed && next_node.is_final() && is_match {
1026 return Some((&self.inp, out.cat(next_node.final_output()), ns));
1027 }
1028 }
1029 }
1030 // If we are streaming backward, we still need to return the empty output, if empty is
1031 // part of our fst, matches the range and the automaton
1032 self.empty_output
1033 .take()
1034 .map(|out| (&[][..], out, transform(&self.aut.start())))
1035 }
1036
1037 // The first transition that is in a bound for a given node.
1038 #[inline]
1039 fn transition_within_bound(&self, node: &Node<'f>, bound: u8) -> Option<usize> {
1040 let mut trans;
1041 if let Some(t) = self.starting_transition(&node) {
1042 trans = t;
1043 } else {
1044 return None;
1045 }
1046 loop {
1047 let transition = node.transition(trans);
1048 if (!self.reversed && transition.inp > bound)
1049 || (self.reversed && transition.inp < bound)
1050 {
1051 return Some(trans);
1052 } else if let Some(t) = self.next_transition(&node, trans) {
1053 trans = t;
1054 } else {
1055 return None;
1056 }
1057 }
1058 }
1059
1060 /// Resolves value of the empty output. Will be none if the empty output should not be returned.
1061 #[inline]
1062 fn resolve_empty_output(&mut self, min: &Bound, max: &Bound) -> Option<Output> {
1063 if min.subceeded_by(&[]) || max.exceeded_by(&[]) {
1064 return None;
1065 }
1066 let start = self.aut.start();
1067 if !self.aut.is_match(&start) {
1068 return None;
1069 }
1070 self.fst.empty_final_output(self.data)
1071 }
1072
1073 #[inline]
1074 fn starting_transition(&self, node: &Node<'f>) -> Option<usize> {
1075 if node.is_empty() {
1076 None
1077 } else if !self.reversed {
1078 Some(0)
1079 } else {
1080 Some(node.len() - 1)
1081 }
1082 }
1083
1084 #[inline]
1085 fn last_transition(&self, node: &Node<'f>) -> Option<usize> {
1086 if node.is_empty() {
1087 None
1088 } else if self.reversed {
1089 Some(0)
1090 } else {
1091 Some(node.len() - 1)
1092 }
1093 }
1094
1095 /// Returns the next transition.
1096 ///
1097 /// The concept of `next` transition is dependent on whether the stream is in reverse mode or
1098 /// not. If all the transitions of this node have been emitted, this method returns None.
1099 #[inline]
1100 fn next_transition(&self, node: &Node<'f>, current_transition: usize) -> Option<usize> {
1101 if self.reversed {
1102 Self::backward_transition(node, current_transition)
1103 } else {
1104 Self::forward_transition(node, current_transition)
1105 }
1106 }
1107
1108 /// See `StreamWithState::next_transition`.
1109 #[inline]
1110 fn previous_transition(&self, node: &Node<'f>, current_transition: usize) -> Option<usize> {
1111 if self.reversed {
1112 Self::forward_transition(node, current_transition)
1113 } else {
1114 Self::backward_transition(node, current_transition)
1115 }
1116 }
1117
1118 /// Returns the next logical transition.
1119 ///
1120 /// This is independent from whether the stream is in backward mode or not.
1121 #[inline]
1122 fn forward_transition(node: &Node<'f>, current_transition: usize) -> Option<usize> {
1123 if current_transition + 1 < node.len() {
1124 Some(current_transition + 1)
1125 } else {
1126 None
1127 }
1128 }
1129
1130 /// See [Stream::forward_transition].
1131 #[inline]
1132 fn backward_transition(node: &Node<'f>, current_transition: usize) -> Option<usize> {
1133 if current_transition > 0 && !node.is_empty() {
1134 Some(current_transition - 1)
1135 } else {
1136 None
1137 }
1138 }
1139}
1140
1141impl<'f, 'a, A: 'a + Automaton> Streamer<'a> for StreamWithState<'f, A>
1142where
1143 A::State: Clone,
1144{
1145 type Item = (&'a [u8], Output, A::State);
1146
1147 fn next(&'a mut self) -> Option<Self::Item> {
1148 self.next(Clone::clone)
1149 }
1150}
1151
1152/// An output is a value that is associated with a key in a finite state
1153/// transducer.
1154///
1155/// Note that outputs must satisfy an algebra. Namely, it must have an additive
1156/// identity and the following binary operations defined: `prefix`,
1157/// `concatenation` and `subtraction`. `prefix` and `concatenation` are
1158/// commutative while `subtraction` is not. `subtraction` is only defined on
1159/// pairs of operands where the first operand is greater than or equal to the
1160/// second operand.
1161///
1162/// Currently, output values must be `u64`. However, in theory, an output value
1163/// can be anything that satisfies the above algebra. Future versions of this
1164/// crate may make outputs generic on this algebra.
1165#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
1166pub struct Output(u64);
1167
1168#[derive(Clone)]
1169struct Buffer {
1170 buf: Box<[u8]>,
1171 len: usize,
1172}
1173
1174impl Buffer {
1175 fn new() -> Self {
1176 Buffer {
1177 buf: vec![0u8; KEY_BUFFER_CAPACITY].into_boxed_slice(),
1178 len: 0,
1179 }
1180 }
1181
1182 fn capacity(&self) -> usize {
1183 self.buf.len()
1184 }
1185
1186 fn double_cap(&mut self) {
1187 let old_cap = self.capacity();
1188 let new_cap = old_cap * 2;
1189 let mut new_buf = vec![0u8; new_cap].into_boxed_slice();
1190 new_buf[..old_cap].copy_from_slice(&self.buf[..old_cap]);
1191 mem::replace(&mut self.buf, new_buf);
1192 }
1193
1194 fn push(&mut self, b: u8) {
1195 if self.capacity() <= self.len {
1196 self.double_cap();
1197 }
1198 self.buf[self.len] = b;
1199 self.len += 1;
1200 }
1201
1202 // Pops one byte and returns the entire chain before the byte was popped.
1203 fn pop(&mut self) -> &[u8] {
1204 let len = self.len;
1205 self.len = len - 1;
1206 &self.buf[..len]
1207 }
1208}
1209
1210impl Deref for Buffer {
1211 type Target = [u8];
1212
1213 fn deref(&self) -> &[u8] {
1214 &self.buf[..self.len]
1215 }
1216}
1217
1218impl Output {
1219 /// Create a new output from a `u64`.
1220 #[inline]
1221 pub fn new(v: u64) -> Output {
1222 Output(v)
1223 }
1224
1225 /// Create a zero output.
1226 #[inline]
1227 pub fn zero() -> Output {
1228 Output(0)
1229 }
1230
1231 /// Retrieve the value inside this output.
1232 #[inline]
1233 pub fn value(self) -> u64 {
1234 self.0
1235 }
1236
1237 /// Returns true if this is a zero output.
1238 #[inline]
1239 pub fn is_zero(self) -> bool {
1240 self.0 == 0
1241 }
1242
1243 /// Returns the prefix of this output and `o`.
1244 #[inline]
1245 pub fn prefix(self, o: Output) -> Output {
1246 Output(cmp::min(self.0, o.0))
1247 }
1248
1249 /// Returns the concatenation of this output and `o`.
1250 #[inline]
1251 pub fn cat(self, o: Output) -> Output {
1252 Output(self.0 + o.0)
1253 }
1254
1255 /// Returns the subtraction of `o` from this output.
1256 ///
1257 /// This function panics if `self > o`.
1258 #[inline]
1259 pub fn sub(self, o: Output) -> Output {
1260 Output(
1261 self.0
1262 .checked_sub(o.0)
1263 .expect("BUG: underflow subtraction not allowed"),
1264 )
1265 }
1266}
1267
1268/// A transition from one note to another.
1269#[derive(Copy, Clone, Hash, Eq, PartialEq)]
1270pub struct Transition {
1271 /// The byte input associated with this transition.
1272 pub inp: u8,
1273 /// The output associated with this transition.
1274 pub out: Output,
1275 /// The address of the node that this transition points to.
1276 pub addr: CompiledAddr,
1277}
1278
1279impl Default for Transition {
1280 #[inline]
1281 fn default() -> Self {
1282 Transition {
1283 inp: 0,
1284 out: Output::zero(),
1285 addr: NONE_ADDRESS,
1286 }
1287 }
1288}
1289
1290impl fmt::Debug for Transition {
1291 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1292 if self.out.is_zero() {
1293 write!(f, "{} -> {}", self.inp as char, self.addr)
1294 } else {
1295 write!(
1296 f,
1297 "({}, {}) -> {}",
1298 self.inp as char,
1299 self.out.value(),
1300 self.addr
1301 )
1302 }
1303 }
1304}
1305
1306#[inline]
1307#[cfg(target_pointer_width = "64")]
1308fn u64_to_usize(n: u64) -> usize {
1309 n as usize
1310}
1311
1312#[inline]
1313#[cfg(not(target_pointer_width = "64"))]
1314fn u64_to_usize(n: u64) -> usize {
1315 if n > ::std::usize::MAX as u64 {
1316 panic!(
1317 "\
1318Cannot convert node address {} to a pointer sized variable. If this FST
1319is very large and was generated on a system with a larger pointer size
1320than this system, then it is not possible to read this FST on this
1321system.",
1322 n
1323 );
1324 }
1325 n as usize
1326}