serbzip_core/codecs/balkanoid/
dict.rs

1//! The Balkanoid dictionary, as well as methods for populating it from
2//! an input stream (in text and binary modes) and for writing a binary
3//! image to a file.
4
5use crate::codecs::balkanoid::Reduction;
6use crate::succinct::{CowStr, Errorlike};
7use bincode::config;
8pub use bincode::error::{DecodeError, EncodeError};
9use std::cmp::Ordering;
10use std::collections::hash_map::Entry;
11use std::collections::HashMap;
12use std::io;
13use std::io::{Read, Write};
14
15/// Emitted when a word vector is about to overflow; i.e., an attempt was made to add more than 2^8 entries.
16pub type OverflowError = Errorlike<CowStr>;
17
18/// A vector of words that is capped in size to 2^8 entries, is always sorted and contains
19/// no duplicate entries.
20#[derive(Default, Debug, bincode::Encode, bincode::Decode, PartialEq, Eq)]
21pub struct WordVec(Vec<String>);
22
23impl WordVec {
24    /// Creates a new vector from a given iterable of string-like words.
25    ///
26    /// # Errors
27    /// [`OverflowError`] if this operation would result in a vector that exceeds 2^8 entries.
28    pub fn new(
29        words: impl IntoIterator<Item = impl Into<String>>,
30    ) -> Result<WordVec, OverflowError> {
31        let mut vec = WordVec::default();
32        for item in words {
33            vec.push(item)?;
34        }
35        Ok(vec)
36    }
37
38    /// Appends a string-like word to the vector.
39    ///
40    /// # Errors
41    /// [`OverflowError`] if this operation would result in a vector that exceeds 2^8 entries.
42    pub fn push(&mut self, word: impl Into<String>) -> Result<(), OverflowError> {
43        if self.0.len() == u8::MAX as usize {
44            Err(OverflowError::borrowed("too many words in vector"))
45        } else {
46            let word = word.into();
47            let position =  self
48                .0
49                .binary_search_by(|existing| comparator(existing, &word));
50            match position {
51                Ok(_) => Ok(()),
52                Err(position) => {
53                    self.0.insert(position, word);
54                    Ok(())
55                }
56            }
57        }
58    }
59}
60
61impl AsRef<Vec<String>> for WordVec {
62    fn as_ref(&self) -> &Vec<String> {
63        &self.0
64    }
65}
66
67impl From<WordVec> for Vec<String> {
68    fn from(vec: WordVec) -> Self {
69        vec.0
70    }
71}
72
73/// The dictionary used by [`Balkanoid`](super::Balkanoid).
74#[derive(Default, Debug, bincode::Encode, bincode::Decode, PartialEq, Eq)]
75pub struct Dict {
76    entries: HashMap<String, WordVec>,
77}
78
79/// Emitted when no word could be resolved at the specified position.
80pub type WordResolveError = Errorlike<CowStr>;
81
82fn comparator(lhs: &String, rhs: &String) -> Ordering {
83    lhs.len().cmp(&rhs.len()).then(lhs.cmp(rhs))
84}
85
86impl From<HashMap<String, WordVec>> for Dict {
87    /// Instantiates a dictionary from a given map.
88    ///
89    /// It assumes that the vectors are pre-sorted and there are no more than
90    /// [`u8::MAX`] words mapped from any fingerprint.
91    ///
92    /// # Examples
93    /// ```
94    /// use std::collections::HashMap;
95    /// use serbzip_core::codecs::balkanoid::Dict;
96    /// use serbzip_core::codecs::balkanoid::dict::WordVec;
97    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
98    /// let dict = Dict::from(HashMap::from(
99    ///     [
100    ///         (String::from("t"), WordVec::new([String::from("at"), String::from("it"), String::from("tea")])?),
101    ///         (String::from("n"), WordVec::new([String::from("in"), String::from("no"), String::from("on")])?)
102    ///     ]
103    /// ));
104    /// assert_eq!(6, dict.count());
105    /// # Ok(())
106    /// # }
107    /// ```
108    fn from(entries: HashMap<String, WordVec>) -> Self {
109        Self { entries }
110    }
111}
112
113/// Emitted when [`Dict::read_from_text_file`] encounters an error.
114#[derive(Debug)]
115pub enum ReadFromTextFileError {
116    Io(io::Error),
117    DictOverflow(OverflowError),
118}
119
120impl From<io::Error> for ReadFromTextFileError {
121    fn from(error: io::Error) -> Self {
122        Self::Io(error)
123    }
124}
125
126impl From<OverflowError> for ReadFromTextFileError {
127    fn from(error: OverflowError) -> Self {
128        Self::DictOverflow(error)
129    }
130}
131
132impl Dict {
133    /// Populates the dictionary from a collection of [`String`] words.
134    ///
135    /// # Errors
136    /// [`OverflowError`] if more than 2^8 words would end up associated with the same fingerprint.
137    pub fn populate(
138        &mut self,
139        words: impl IntoIterator<Item = String>,
140    ) -> Result<(), OverflowError> {
141        for word in words {
142            let reduction = Reduction::from(&word as &str).take_if_lowercase();
143            if let Some(Reduction { fingerprint, .. }) = reduction {
144                if !fingerprint.is_empty() {
145                    let mapped_words = match self.entries.entry(fingerprint) {
146                        Entry::Occupied(entry) => entry.into_mut(),
147                        Entry::Vacant(entry) => entry.insert(WordVec::default()),
148                    };
149                    mapped_words.push(word)?;
150                }
151            }
152        }
153        Ok(())
154    }
155
156    /// Obtains the aggregate count of words collated in this dictionary.
157    pub fn count(&self) -> usize {
158        self.entries.values().map(AsRef::as_ref).map(Vec::len).sum()
159    }
160
161    /// Resolves a collated word, given its fingerprint and position.
162    ///
163    /// If the fingerprint does not exist, returns `Ok(None)`. Otherwise, if the fingerprint
164    /// exists and there is a word at the given position, returns `Ok(Some(..))`, containing
165    /// the collated word. Otherwise, if the fingerprint exists, but no word is collated at
166    /// the given position, this method returns a [`WordResolveError`].
167    ///
168    /// # Errors
169    /// [`WordResolveError`] if the given fingerprint exists, but the position does not
170    /// resolve to a collated word.
171    pub(crate) fn resolve(
172        &self,
173        fingerprint: &str,
174        position: u8,
175    ) -> Result<Option<&String>, WordResolveError> {
176        match self.entries.get(fingerprint) {
177            None => Ok(None),
178            Some(entry) => match entry.as_ref().get(position as usize) {
179                None => Err(Errorlike::owned(format!(
180                    "no dictionary word at position {position} for fingerprint '{fingerprint}'"
181                ))),
182                Some(word) => Ok(Some(word)),
183            },
184        }
185    }
186
187    /// Obtains the position of a word, given its fingerprint.
188    ///
189    /// The position is returned as a [`Some(u8)`] if the fingerprint exists and the given word
190    /// is in the vector. Otherwise, [`None`] is returned.
191    pub(crate) fn position(&self, fingerprint: &str, word: &str) -> Option<u8> {
192        match self.entries.get(fingerprint) {
193            None => None,
194            Some(entry) => entry
195                .as_ref()
196                .iter()
197                .position(|existing| existing == word)
198                .map(|pos| u8::try_from(pos).unwrap()), // pos is guaranteed to be less than 2^8
199        }
200    }
201
202    /// Checks whether the given fingerprint exists in the dictionary.
203    pub(crate) fn contains_fingerprint(&self, fingerprint: &str) -> bool {
204        self.entries.contains_key(fingerprint)
205    }
206
207    /// Outputs the contents of the dictionary to the given writer, using the `bincode` protocol.
208    ///
209    /// # Errors
210    /// [`EncodeError`] if an error occurred during encoding.
211    pub fn write_to_binary_image(&self, w: &mut impl Write) -> Result<usize, EncodeError> {
212        bincode::encode_into_std_write(self, w, config::standard())
213    }
214
215    /// Loads a new dictionary from a given reader, using the `bincode` protocol.
216    ///
217    /// # Errors
218    /// [`DecodeError`] if an error occurred during decoding.
219    pub fn read_from_binary_image(r: &mut impl Read) -> Result<Dict, DecodeError> {
220        bincode::decode_from_std_read(r, config::standard())
221    }
222
223    /// Loads a new dictionary from a given reader, the latter containing a newline-delimited
224    /// wordlist.
225    ///
226    /// # Errors
227    /// [`ReadFromTextFileError`] if an I/O error occurs or if a vector inside the dictionary would overflow.
228    pub fn read_from_text_file(r: &mut impl Read) -> Result<Dict, ReadFromTextFileError> {
229        let mut buf = String::new();
230        r.read_to_string(&mut buf)?;
231        let mut dict = Dict::default();
232        for line in buf.lines() {
233            let line = line.split_whitespace();
234            dict.populate(line.map(ToOwned::to_owned))?;
235        }
236        Ok(dict)
237    }
238}
239
240#[cfg(test)]
241pub(in crate::codecs::balkanoid) mod tests;