serbzip_core/codecs/balkanoid/dict.rs
1//! The Balkanoid dictionary, as well as methods for populating it from
2//! an input stream (in text and binary modes) and for writing a binary
3//! image to a file.
4
5use crate::codecs::balkanoid::Reduction;
6use crate::succinct::{CowStr, Errorlike};
7use bincode::config;
8pub use bincode::error::{DecodeError, EncodeError};
9use std::cmp::Ordering;
10use std::collections::hash_map::Entry;
11use std::collections::HashMap;
12use std::io;
13use std::io::{Read, Write};
14
15/// Emitted when a word vector is about to overflow; i.e., an attempt was made to add more than 2^8 entries.
16pub type OverflowError = Errorlike<CowStr>;
17
18/// A vector of words that is capped in size to 2^8 entries, is always sorted and contains
19/// no duplicate entries.
20#[derive(Default, Debug, bincode::Encode, bincode::Decode, PartialEq, Eq)]
21pub struct WordVec(Vec<String>);
22
23impl WordVec {
24 /// Creates a new vector from a given iterable of string-like words.
25 ///
26 /// # Errors
27 /// [`OverflowError`] if this operation would result in a vector that exceeds 2^8 entries.
28 pub fn new(
29 words: impl IntoIterator<Item = impl Into<String>>,
30 ) -> Result<WordVec, OverflowError> {
31 let mut vec = WordVec::default();
32 for item in words {
33 vec.push(item)?;
34 }
35 Ok(vec)
36 }
37
38 /// Appends a string-like word to the vector.
39 ///
40 /// # Errors
41 /// [`OverflowError`] if this operation would result in a vector that exceeds 2^8 entries.
42 pub fn push(&mut self, word: impl Into<String>) -> Result<(), OverflowError> {
43 if self.0.len() == u8::MAX as usize {
44 Err(OverflowError::borrowed("too many words in vector"))
45 } else {
46 let word = word.into();
47 let position = self
48 .0
49 .binary_search_by(|existing| comparator(existing, &word));
50 match position {
51 Ok(_) => Ok(()),
52 Err(position) => {
53 self.0.insert(position, word);
54 Ok(())
55 }
56 }
57 }
58 }
59}
60
61impl AsRef<Vec<String>> for WordVec {
62 fn as_ref(&self) -> &Vec<String> {
63 &self.0
64 }
65}
66
67impl From<WordVec> for Vec<String> {
68 fn from(vec: WordVec) -> Self {
69 vec.0
70 }
71}
72
73/// The dictionary used by [`Balkanoid`](super::Balkanoid).
74#[derive(Default, Debug, bincode::Encode, bincode::Decode, PartialEq, Eq)]
75pub struct Dict {
76 entries: HashMap<String, WordVec>,
77}
78
79/// Emitted when no word could be resolved at the specified position.
80pub type WordResolveError = Errorlike<CowStr>;
81
82fn comparator(lhs: &String, rhs: &String) -> Ordering {
83 lhs.len().cmp(&rhs.len()).then(lhs.cmp(rhs))
84}
85
86impl From<HashMap<String, WordVec>> for Dict {
87 /// Instantiates a dictionary from a given map.
88 ///
89 /// It assumes that the vectors are pre-sorted and there are no more than
90 /// [`u8::MAX`] words mapped from any fingerprint.
91 ///
92 /// # Examples
93 /// ```
94 /// use std::collections::HashMap;
95 /// use serbzip_core::codecs::balkanoid::Dict;
96 /// use serbzip_core::codecs::balkanoid::dict::WordVec;
97 /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
98 /// let dict = Dict::from(HashMap::from(
99 /// [
100 /// (String::from("t"), WordVec::new([String::from("at"), String::from("it"), String::from("tea")])?),
101 /// (String::from("n"), WordVec::new([String::from("in"), String::from("no"), String::from("on")])?)
102 /// ]
103 /// ));
104 /// assert_eq!(6, dict.count());
105 /// # Ok(())
106 /// # }
107 /// ```
108 fn from(entries: HashMap<String, WordVec>) -> Self {
109 Self { entries }
110 }
111}
112
113/// Emitted when [`Dict::read_from_text_file`] encounters an error.
114#[derive(Debug)]
115pub enum ReadFromTextFileError {
116 Io(io::Error),
117 DictOverflow(OverflowError),
118}
119
120impl From<io::Error> for ReadFromTextFileError {
121 fn from(error: io::Error) -> Self {
122 Self::Io(error)
123 }
124}
125
126impl From<OverflowError> for ReadFromTextFileError {
127 fn from(error: OverflowError) -> Self {
128 Self::DictOverflow(error)
129 }
130}
131
132impl Dict {
133 /// Populates the dictionary from a collection of [`String`] words.
134 ///
135 /// # Errors
136 /// [`OverflowError`] if more than 2^8 words would end up associated with the same fingerprint.
137 pub fn populate(
138 &mut self,
139 words: impl IntoIterator<Item = String>,
140 ) -> Result<(), OverflowError> {
141 for word in words {
142 let reduction = Reduction::from(&word as &str).take_if_lowercase();
143 if let Some(Reduction { fingerprint, .. }) = reduction {
144 if !fingerprint.is_empty() {
145 let mapped_words = match self.entries.entry(fingerprint) {
146 Entry::Occupied(entry) => entry.into_mut(),
147 Entry::Vacant(entry) => entry.insert(WordVec::default()),
148 };
149 mapped_words.push(word)?;
150 }
151 }
152 }
153 Ok(())
154 }
155
156 /// Obtains the aggregate count of words collated in this dictionary.
157 pub fn count(&self) -> usize {
158 self.entries.values().map(AsRef::as_ref).map(Vec::len).sum()
159 }
160
161 /// Resolves a collated word, given its fingerprint and position.
162 ///
163 /// If the fingerprint does not exist, returns `Ok(None)`. Otherwise, if the fingerprint
164 /// exists and there is a word at the given position, returns `Ok(Some(..))`, containing
165 /// the collated word. Otherwise, if the fingerprint exists, but no word is collated at
166 /// the given position, this method returns a [`WordResolveError`].
167 ///
168 /// # Errors
169 /// [`WordResolveError`] if the given fingerprint exists, but the position does not
170 /// resolve to a collated word.
171 pub(crate) fn resolve(
172 &self,
173 fingerprint: &str,
174 position: u8,
175 ) -> Result<Option<&String>, WordResolveError> {
176 match self.entries.get(fingerprint) {
177 None => Ok(None),
178 Some(entry) => match entry.as_ref().get(position as usize) {
179 None => Err(Errorlike::owned(format!(
180 "no dictionary word at position {position} for fingerprint '{fingerprint}'"
181 ))),
182 Some(word) => Ok(Some(word)),
183 },
184 }
185 }
186
187 /// Obtains the position of a word, given its fingerprint.
188 ///
189 /// The position is returned as a [`Some(u8)`] if the fingerprint exists and the given word
190 /// is in the vector. Otherwise, [`None`] is returned.
191 pub(crate) fn position(&self, fingerprint: &str, word: &str) -> Option<u8> {
192 match self.entries.get(fingerprint) {
193 None => None,
194 Some(entry) => entry
195 .as_ref()
196 .iter()
197 .position(|existing| existing == word)
198 .map(|pos| u8::try_from(pos).unwrap()), // pos is guaranteed to be less than 2^8
199 }
200 }
201
202 /// Checks whether the given fingerprint exists in the dictionary.
203 pub(crate) fn contains_fingerprint(&self, fingerprint: &str) -> bool {
204 self.entries.contains_key(fingerprint)
205 }
206
207 /// Outputs the contents of the dictionary to the given writer, using the `bincode` protocol.
208 ///
209 /// # Errors
210 /// [`EncodeError`] if an error occurred during encoding.
211 pub fn write_to_binary_image(&self, w: &mut impl Write) -> Result<usize, EncodeError> {
212 bincode::encode_into_std_write(self, w, config::standard())
213 }
214
215 /// Loads a new dictionary from a given reader, using the `bincode` protocol.
216 ///
217 /// # Errors
218 /// [`DecodeError`] if an error occurred during decoding.
219 pub fn read_from_binary_image(r: &mut impl Read) -> Result<Dict, DecodeError> {
220 bincode::decode_from_std_read(r, config::standard())
221 }
222
223 /// Loads a new dictionary from a given reader, the latter containing a newline-delimited
224 /// wordlist.
225 ///
226 /// # Errors
227 /// [`ReadFromTextFileError`] if an I/O error occurs or if a vector inside the dictionary would overflow.
228 pub fn read_from_text_file(r: &mut impl Read) -> Result<Dict, ReadFromTextFileError> {
229 let mut buf = String::new();
230 r.read_to_string(&mut buf)?;
231 let mut dict = Dict::default();
232 for line in buf.lines() {
233 let line = line.split_whitespace();
234 dict.populate(line.map(ToOwned::to_owned))?;
235 }
236 Ok(dict)
237 }
238}
239
240#[cfg(test)]
241pub(in crate::codecs::balkanoid) mod tests;