1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
//! The Balkanoid dictionary, as well as methods for populating it from
//! an input stream (in text and binary modes) and for writing a binary
//! image to a file.

use crate::codecs::balkanoid::Reduction;
use crate::succinct::{CowStr, Errorlike};
use bincode::config;
pub use bincode::error::{DecodeError, EncodeError};
use std::cmp::Ordering;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::io;
use std::io::{Read, Write};

/// Emitted when a word vector is about to overflow; i.e., an attempt was made to add more than 2^8 entries.
pub type OverflowError = Errorlike<CowStr>;

/// A vector of words that is capped in size to 2^8 entries, is always sorted and contains
/// no duplicate entries.
#[derive(Default, Debug, bincode::Encode, bincode::Decode, PartialEq, Eq)]
pub struct WordVec(Vec<String>);

impl WordVec {
    /// Creates a new vector from a given iterable of string-like words.
    ///
    /// # Errors
    /// [`OverflowError`] if this operation would result in a vector that exceeds 2^8 entries.
    pub fn new(
        words: impl IntoIterator<Item = impl Into<String>>,
    ) -> Result<WordVec, OverflowError> {
        let mut vec = WordVec::default();
        for item in words {
            vec.push(item)?;
        }
        Ok(vec)
    }

    /// Appends a string-like word to the vector.
    ///
    /// # Errors
    /// [`OverflowError`] if this operation would result in a vector that exceeds 2^8 entries.
    pub fn push(&mut self, word: impl Into<String>) -> Result<(), OverflowError> {
        if self.0.len() == u8::MAX as usize {
            Err(OverflowError::borrowed("too many words in vector"))
        } else {
            let word = word.into();
            let position =  self
                .0
                .binary_search_by(|existing| comparator(existing, &word));
            match position {
                Ok(_) => Ok(()),
                Err(position) => {
                    self.0.insert(position, word);
                    Ok(())
                }
            }
        }
    }
}

impl AsRef<Vec<String>> for WordVec {
    fn as_ref(&self) -> &Vec<String> {
        &self.0
    }
}

impl From<WordVec> for Vec<String> {
    fn from(vec: WordVec) -> Self {
        vec.0
    }
}

/// The dictionary used by [`Balkanoid`](super::Balkanoid).
#[derive(Default, Debug, bincode::Encode, bincode::Decode, PartialEq, Eq)]
pub struct Dict {
    entries: HashMap<String, WordVec>,
}

/// Emitted when no word could be resolved at the specified position.
pub type WordResolveError = Errorlike<CowStr>;

fn comparator(lhs: &String, rhs: &String) -> Ordering {
    lhs.len().cmp(&rhs.len()).then(lhs.cmp(rhs))
}

impl From<HashMap<String, WordVec>> for Dict {
    /// Instantiates a dictionary from a given map.
    ///
    /// It assumes that the vectors are pre-sorted and there are no more than
    /// [`u8::MAX`] words mapped from any fingerprint.
    ///
    /// # Examples
    /// ```
    /// use std::collections::HashMap;
    /// use serbzip_core::codecs::balkanoid::Dict;
    /// use serbzip_core::codecs::balkanoid::dict::WordVec;
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let dict = Dict::from(HashMap::from(
    ///     [
    ///         (String::from("t"), WordVec::new([String::from("at"), String::from("it"), String::from("tea")])?),
    ///         (String::from("n"), WordVec::new([String::from("in"), String::from("no"), String::from("on")])?)
    ///     ]
    /// ));
    /// assert_eq!(6, dict.count());
    /// # Ok(())
    /// # }
    /// ```
    fn from(entries: HashMap<String, WordVec>) -> Self {
        Self { entries }
    }
}

/// Emitted when [`Dict::read_from_text_file`] encounters an error.
#[derive(Debug)]
pub enum ReadFromTextFileError {
    Io(io::Error),
    DictOverflow(OverflowError),
}

impl From<io::Error> for ReadFromTextFileError {
    fn from(error: io::Error) -> Self {
        Self::Io(error)
    }
}

impl From<OverflowError> for ReadFromTextFileError {
    fn from(error: OverflowError) -> Self {
        Self::DictOverflow(error)
    }
}

impl Dict {
    /// Populates the dictionary from a collection of [`String`] words.
    ///
    /// # Errors
    /// [`OverflowError`] if more than 2^8 words would end up associated with the same fingerprint.
    pub fn populate(
        &mut self,
        words: impl IntoIterator<Item = String>,
    ) -> Result<(), OverflowError> {
        for word in words {
            let reduction = Reduction::from(&word as &str).take_if_lowercase();
            if let Some(Reduction { fingerprint, .. }) = reduction {
                if !fingerprint.is_empty() {
                    let mapped_words = match self.entries.entry(fingerprint) {
                        Entry::Occupied(entry) => entry.into_mut(),
                        Entry::Vacant(entry) => entry.insert(WordVec::default()),
                    };
                    mapped_words.push(word)?;
                }
            }
        }
        Ok(())
    }

    /// Obtains the aggregate count of words collated in this dictionary.
    pub fn count(&self) -> usize {
        self.entries.values().map(AsRef::as_ref).map(Vec::len).sum()
    }

    /// Resolves a collated word, given its fingerprint and position.
    ///
    /// If the fingerprint does not exist, returns `Ok(None)`. Otherwise, if the fingerprint
    /// exists and there is a word at the given position, returns `Ok(Some(..))`, containing
    /// the collated word. Otherwise, if the fingerprint exists, but no word is collated at
    /// the given position, this method returns a [`WordResolveError`].
    ///
    /// # Errors
    /// [`WordResolveError`] if the given fingerprint exists, but the position does not
    /// resolve to a collated word.
    pub(crate) fn resolve(
        &self,
        fingerprint: &str,
        position: u8,
    ) -> Result<Option<&String>, WordResolveError> {
        match self.entries.get(fingerprint) {
            None => Ok(None),
            Some(entry) => match entry.as_ref().get(position as usize) {
                None => Err(Errorlike::owned(format!(
                    "no dictionary word at position {position} for fingerprint '{fingerprint}'"
                ))),
                Some(word) => Ok(Some(word)),
            },
        }
    }

    /// Obtains the position of a word, given its fingerprint.
    ///
    /// The position is returned as a [`Some(u8)`] if the fingerprint exists and the given word
    /// is in the vector. Otherwise, [`None`] is returned.
    pub(crate) fn position(&self, fingerprint: &str, word: &str) -> Option<u8> {
        match self.entries.get(fingerprint) {
            None => None,
            Some(entry) => entry
                .as_ref()
                .iter()
                .position(|existing| existing == word)
                .map(|pos| u8::try_from(pos).unwrap()), // pos is guaranteed to be less than 2^8
        }
    }

    /// Checks whether the given fingerprint exists in the dictionary.
    pub(crate) fn contains_fingerprint(&self, fingerprint: &str) -> bool {
        self.entries.contains_key(fingerprint)
    }

    /// Outputs the contents of the dictionary to the given writer, using the `bincode` protocol.
    ///
    /// # Errors
    /// [`EncodeError`] if an error occurred during encoding.
    pub fn write_to_binary_image(&self, w: &mut impl Write) -> Result<usize, EncodeError> {
        bincode::encode_into_std_write(self, w, config::standard())
    }

    /// Loads a new dictionary from a given reader, using the `bincode` protocol.
    ///
    /// # Errors
    /// [`DecodeError`] if an error occurred during decoding.
    pub fn read_from_binary_image(r: &mut impl Read) -> Result<Dict, DecodeError> {
        bincode::decode_from_std_read(r, config::standard())
    }

    /// Loads a new dictionary from a given reader, the latter containing a newline-delimited
    /// wordlist.
    ///
    /// # Errors
    /// [`ReadFromTextFileError`] if an I/O error occurs or if a vector inside the dictionary would overflow.
    pub fn read_from_text_file(r: &mut impl Read) -> Result<Dict, ReadFromTextFileError> {
        let mut buf = String::new();
        r.read_to_string(&mut buf)?;
        let mut dict = Dict::default();
        for line in buf.lines() {
            let line = line.split_whitespace();
            dict.populate(line.map(ToOwned::to_owned))?;
        }
        Ok(dict)
    }
}

#[cfg(test)]
pub(in crate::codecs::balkanoid) mod tests;