fst_no_std/raw/
build.rs

1use crate::bytes;
2use crate::error::Result;
3use crate::raw::counting_writer::CountingWriter;
4use crate::raw::error::Error;
5use crate::raw::registry::Registry;
6use crate::raw::registry::RegistryEntry;
7use crate::raw::Output;
8use crate::raw::{CompiledAddr, Transition};
9use crate::raw::{Fst, FstType, EMPTY_ADDRESS, NONE_ADDRESS, VERSION};
10use crate::stream::{IntoStreamer, Streamer};
11use alloc::{vec, vec::Vec};
12use std::io;
13
14/// A builder for creating a finite state transducer.
15///
16/// This is not your average everyday builder. It has two important qualities
17/// that make it a bit unique from what you might expect:
18///
19/// 1. All keys must be added in lexicographic order. Adding a key out of order
20///    will result in an error. Additionally, adding a duplicate key with an
21///    output value will also result in an error. That is, once a key is
22///    associated with a value, that association can never be modified or
23///    deleted.
24/// 2. The representation of an fst is streamed to *any* `io::Write` as it is
25///    built. For an in memory representation, this can be a `Vec<u8>`.
26///
27/// Point (2) is especially important because it means that an fst can be
28/// constructed *without storing the entire fst in memory*. Namely, since it
29/// works with any `io::Write`, it can be streamed directly to a file.
30///
31/// With that said, the builder does use memory, but **memory usage is bounded
32/// to a constant size**. The amount of memory used trades off with the
33/// compression ratio. Currently, the implementation hard codes this trade off
34/// which can result in about 5-20MB of heap usage during construction. (N.B.
35/// Guaranteeing a maximal compression ratio requires memory proportional to
36/// the size of the fst, which defeats some of the benefit of streaming
37/// it to disk. In practice, a small bounded amount of memory achieves
38/// close-to-minimal compression ratios.)
39///
40/// The algorithmic complexity of fst construction is `O(n)` where `n` is the
41/// number of elements added to the fst.
42#[cfg(feature = "std")]
43pub struct Builder<W> {
44    /// The FST raw data is written directly to `wtr`.
45    ///
46    /// No internal buffering is done.
47    wtr: CountingWriter<W>,
48    /// The stack of unfinished nodes.
49    ///
50    /// An unfinished node is a node that could potentially have a new
51    /// transition added to it when a new word is added to the dictionary.
52    unfinished: UnfinishedNodes,
53    /// A map of finished nodes.
54    ///
55    /// A finished node is one that has been compiled and written to `wtr`.
56    /// After this point, the node is considered immutable and will never
57    /// change.
58    registry: Registry,
59    /// The last word added.
60    ///
61    /// This is used to enforce the invariant that words are added in sorted
62    /// order.
63    last: Option<Vec<u8>>,
64    /// The address of the last compiled node.
65    ///
66    /// This is used to optimize states with one transition that point
67    /// to the previously compiled node. (The previously compiled node in
68    /// this case actually corresponds to the next state for the transition,
69    /// since states are compiled in reverse.)
70    last_addr: CompiledAddr,
71    /// The number of keys added.
72    len: usize,
73}
74
75#[derive(Debug)]
76#[cfg(feature = "alloc")]
77struct UnfinishedNodes {
78    stack: Vec<BuilderNodeUnfinished>,
79}
80
81#[derive(Debug)]
82#[cfg(feature = "alloc")]
83struct BuilderNodeUnfinished {
84    node: BuilderNode,
85    last: Option<LastTransition>,
86}
87
88#[derive(Debug, Hash, Eq, PartialEq)]
89#[cfg(feature = "alloc")]
90pub struct BuilderNode {
91    pub is_final: bool,
92    pub final_output: Output,
93    pub trans: Vec<Transition>,
94}
95
96#[derive(Debug)]
97struct LastTransition {
98    inp: u8,
99    out: Output,
100}
101
102#[cfg(feature = "std")]
103impl Builder<Vec<u8>> {
104    /// Create a builder that builds an fst in memory.
105    #[inline]
106    #[must_use]
107    pub fn memory() -> Builder<Vec<u8>> {
108        Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap()
109    }
110
111    /// Finishes construction of the FST and returns it.
112    #[inline]
113    pub fn into_fst(self) -> Fst<Vec<u8>> {
114        self.into_inner().and_then(Fst::new).unwrap()
115    }
116}
117
118#[cfg(feature = "std")]
119impl<W: io::Write> Builder<W> {
120    /// Create a builder that builds an fst by writing it to `wtr` in a
121    /// streaming fashion.
122    pub fn new(wtr: W) -> Result<Builder<W>> {
123        Builder::new_type(wtr, 0)
124    }
125
126    /// The same as `new`, except it sets the type of the fst to the type
127    /// given.
128    pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> {
129        let mut wtr = CountingWriter::new(wtr);
130        // Don't allow any nodes to have address 0-7. We use these to encode
131        // the API version. We also use addresses `0` and `1` as special
132        // sentinel values, so they should never correspond to a real node.
133        bytes::io_write_u64_le(VERSION, &mut wtr)?;
134        // Similarly for 8-15 for the fst type.
135        bytes::io_write_u64_le(ty, &mut wtr)?;
136        Ok(Builder {
137            wtr,
138            unfinished: UnfinishedNodes::new(),
139            registry: Registry::new(10_000, 2),
140            last: None,
141            last_addr: NONE_ADDRESS,
142            len: 0,
143        })
144    }
145
146    /// Adds a byte string to this FST with a zero output value.
147    pub fn add<B>(&mut self, bs: B) -> Result<()>
148    where
149        B: AsRef<[u8]>,
150    {
151        self.check_last_key(bs.as_ref(), false)?;
152        self.insert_output(bs, None)
153    }
154
155    /// Insert a new key-value pair into the fst.
156    ///
157    /// Keys must be convertible to byte strings. Values must be a `u64`, which
158    /// is a restriction of the current implementation of finite state
159    /// transducers. (Values may one day be expanded to other types.)
160    ///
161    /// If a key is inserted that is less than or equal to any previous key
162    /// added, then an error is returned. Similarly, if there was a problem
163    /// writing to the underlying writer, an error is returned.
164    pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()>
165    where
166        B: AsRef<[u8]>,
167    {
168        self.check_last_key(bs.as_ref(), true)?;
169        self.insert_output(bs, Some(Output::new(val)))
170    }
171
172    /// Calls insert on each item in the iterator.
173    ///
174    /// If an error occurred while adding an element, processing is stopped
175    /// and the error is returned.
176    ///
177    /// If a key is inserted that is less than or equal to any previous key
178    /// added, then an error is returned. Similarly, if there was a problem
179    /// writing to the underlying writer, an error is returned.
180    pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
181    where
182        T: AsRef<[u8]>,
183        I: IntoIterator<Item = (T, Output)>,
184    {
185        for (key, out) in iter {
186            self.insert(key, out.value())?;
187        }
188        Ok(())
189    }
190
191    /// Calls insert on each item in the stream.
192    ///
193    /// Note that unlike `extend_iter`, this is not generic on the items in
194    /// the stream.
195    ///
196    /// If a key is inserted that is less than or equal to any previous key
197    /// added, then an error is returned. Similarly, if there was a problem
198    /// writing to the underlying writer, an error is returned.
199    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
200    where
201        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
202        S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
203    {
204        let mut stream = stream.into_stream();
205        while let Some((key, out)) = stream.next() {
206            self.insert(key, out.value())?;
207        }
208        Ok(())
209    }
210
211    /// Finishes the construction of the fst and flushes the underlying
212    /// writer. After completion, the data written to `W` may be read using
213    /// one of `Fst`'s constructor methods.
214    pub fn finish(self) -> Result<()> {
215        self.into_inner()?;
216        Ok(())
217    }
218
219    /// Just like `finish`, except it returns the underlying writer after
220    /// flushing it.
221    pub fn into_inner(mut self) -> Result<W> {
222        self.compile_from(0)?;
223        let root_node = self.unfinished.pop_root();
224        let root_addr = self.compile(&root_node)?;
225        bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?;
226        bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?;
227
228        let sum = self.wtr.masked_checksum();
229        let mut wtr = self.wtr.into_inner();
230        bytes::io_write_u32_le(sum, &mut wtr)?;
231        wtr.flush()?;
232        Ok(wtr)
233    }
234
235    fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
236    where
237        B: AsRef<[u8]>,
238    {
239        let bs = bs.as_ref();
240        if bs.is_empty() {
241            self.len = 1; // must be first key, so length is always 1
242            self.unfinished.set_root_output(out.unwrap_or(Output::zero()));
243            return Ok(());
244        }
245        let (prefix_len, out) = if let Some(out) = out {
246            self.unfinished.find_common_prefix_and_set_output(bs, out)
247        } else {
248            (self.unfinished.find_common_prefix(bs), Output::zero())
249        };
250        if prefix_len == bs.len() {
251            // If the prefix found consumes the entire set of bytes, then
252            // the prefix *equals* the bytes given. This means it is a
253            // duplicate value with no output. So we can give up here.
254            //
255            // If the below assert fails, then that means we let a duplicate
256            // value through even when inserting outputs.
257            assert!(out.is_zero());
258            return Ok(());
259        }
260        self.len += 1;
261        self.compile_from(prefix_len)?;
262        self.unfinished.add_suffix(&bs[prefix_len..], out);
263        Ok(())
264    }
265
266    fn compile_from(&mut self, istate: usize) -> Result<()> {
267        let mut addr = NONE_ADDRESS;
268        while istate + 1 < self.unfinished.len() {
269            let node = if addr == NONE_ADDRESS {
270                self.unfinished.pop_empty()
271            } else {
272                self.unfinished.pop_freeze(addr)
273            };
274            addr = self.compile(&node)?;
275            assert!(addr != NONE_ADDRESS);
276        }
277        self.unfinished.top_last_freeze(addr);
278        Ok(())
279    }
280
281    fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> {
282        if node.is_final
283            && node.trans.is_empty()
284            && node.final_output.is_zero()
285        {
286            return Ok(EMPTY_ADDRESS);
287        }
288        let entry = self.registry.entry(node);
289        if let RegistryEntry::Found(ref addr) = entry {
290            return Ok(*addr);
291        }
292        let start_addr = self.wtr.count() as CompiledAddr;
293        node.compile_to(&mut self.wtr, self.last_addr, start_addr)?;
294        self.last_addr = self.wtr.count() as CompiledAddr - 1;
295        if let RegistryEntry::NotFound(cell) = entry {
296            cell.insert(self.last_addr);
297        }
298        Ok(self.last_addr)
299    }
300
301    fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> {
302        if let Some(ref mut last) = self.last {
303            if check_dupe && bs == &**last {
304                return Err(Error::DuplicateKey { got: bs.to_vec() }.into());
305            }
306            if bs < &**last {
307                return Err(Error::OutOfOrder {
308                    previous: last.clone(),
309                    got: bs.to_vec(),
310                }
311                .into());
312            }
313            last.clear();
314            for &b in bs {
315                last.push(b);
316            }
317        } else {
318            self.last = Some(bs.to_vec());
319        }
320        Ok(())
321    }
322
323    /// Gets a reference to the underlying writer.
324    pub fn get_ref(&self) -> &W {
325        self.wtr.get_ref()
326    }
327
328    /// Returns the number of bytes written to the underlying writer
329    pub fn bytes_written(&self) -> u64 {
330        self.wtr.count()
331    }
332}
333
334#[cfg(feature = "std")]
335impl UnfinishedNodes {
336    fn new() -> UnfinishedNodes {
337        let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) };
338        unfinished.push_empty(false);
339        unfinished
340    }
341
342    fn len(&self) -> usize {
343        self.stack.len()
344    }
345
346    fn push_empty(&mut self, is_final: bool) {
347        self.stack.push(BuilderNodeUnfinished {
348            node: BuilderNode { is_final, ..BuilderNode::default() },
349            last: None,
350        });
351    }
352
353    fn pop_root(&mut self) -> BuilderNode {
354        assert!(self.stack.len() == 1);
355        assert!(self.stack[0].last.is_none());
356        self.stack.pop().unwrap().node
357    }
358
359    fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode {
360        let mut unfinished = self.stack.pop().unwrap();
361        unfinished.last_compiled(addr);
362        unfinished.node
363    }
364
365    fn pop_empty(&mut self) -> BuilderNode {
366        let unfinished = self.stack.pop().unwrap();
367        assert!(unfinished.last.is_none());
368        unfinished.node
369    }
370
371    fn set_root_output(&mut self, out: Output) {
372        self.stack[0].node.is_final = true;
373        self.stack[0].node.final_output = out;
374    }
375
376    fn top_last_freeze(&mut self, addr: CompiledAddr) {
377        let last = self.stack.len().checked_sub(1).unwrap();
378        self.stack[last].last_compiled(addr);
379    }
380
381    fn add_suffix(&mut self, bs: &[u8], out: Output) {
382        if bs.is_empty() {
383            return;
384        }
385        let last = self.stack.len().checked_sub(1).unwrap();
386        assert!(self.stack[last].last.is_none());
387        self.stack[last].last = Some(LastTransition { inp: bs[0], out });
388        for &b in &bs[1..] {
389            self.stack.push(BuilderNodeUnfinished {
390                node: BuilderNode::default(),
391                last: Some(LastTransition { inp: b, out: Output::zero() }),
392            });
393        }
394        self.push_empty(true);
395    }
396
397    fn find_common_prefix(&mut self, bs: &[u8]) -> usize {
398        bs.iter()
399            .zip(&self.stack)
400            .take_while(|&(&b, node)| {
401                node.last.as_ref().is_some_and(|t| t.inp == b)
402            })
403            .count()
404    }
405
406    fn find_common_prefix_and_set_output(
407        &mut self,
408        bs: &[u8],
409        mut out: Output,
410    ) -> (usize, Output) {
411        let mut i = 0;
412        while i < bs.len() {
413            let add_prefix = match self.stack[i].last.as_mut() {
414                Some(ref mut t) if t.inp == bs[i] => {
415                    i += 1;
416                    let common_pre = t.out.prefix(out);
417                    let add_prefix = t.out - common_pre;
418                    out = out - common_pre;
419                    t.out = common_pre;
420                    add_prefix
421                }
422                _ => break,
423            };
424            if !add_prefix.is_zero() {
425                self.stack[i].add_output_prefix(add_prefix);
426            }
427        }
428        (i, out)
429    }
430}
431
432#[cfg(feature = "std")]
433impl BuilderNodeUnfinished {
434    fn last_compiled(&mut self, addr: CompiledAddr) {
435        if let Some(trans) = self.last.take() {
436            self.node.trans.push(Transition {
437                inp: trans.inp,
438                out: trans.out,
439                addr,
440            });
441        }
442    }
443
444    fn add_output_prefix(&mut self, prefix: Output) {
445        if self.node.is_final {
446            self.node.final_output = prefix.cat(self.node.final_output);
447        }
448        for t in &mut self.node.trans {
449            t.out = prefix.cat(t.out);
450        }
451        if let Some(ref mut t) = self.last {
452            t.out = prefix.cat(t.out);
453        }
454    }
455}
456
457#[cfg(feature = "alloc")]
458impl Clone for BuilderNode {
459    fn clone(&self) -> BuilderNode {
460        BuilderNode {
461            is_final: self.is_final,
462            final_output: self.final_output,
463            trans: self.trans.clone(),
464        }
465    }
466
467    fn clone_from(&mut self, source: &BuilderNode) {
468        self.is_final = source.is_final;
469        self.final_output = source.final_output;
470        self.trans.clear();
471        self.trans.extend(source.trans.iter());
472    }
473}
474
475#[cfg(feature = "alloc")]
476impl Default for BuilderNode {
477    fn default() -> BuilderNode {
478        BuilderNode {
479            is_final: false,
480            final_output: Output::zero(),
481            trans: vec![],
482        }
483    }
484}