serbzip_core/codecs/
balkanoid.rs

1//! # Balkanoid
2//! _The codec that made Russians and Serbs inseparable._
3//!
4//! ## Historical background
5//! It has long been accepted that Serbian is a compact variant of Russian, with less liberal use
6//! of vowels. Since the forfeiting of the Riviera in 1991, the loss of tourism revenue has led
7//! to further austerity in vowel use. Serbs increasingly needed economically viable ways
8//! of communicating, since vowels aren't exactly cheap!
9//!
//! ## What is it?
11//! Balkanoid is a universal transcoder between Serbo-Croatian and Russian languages that is almost
12//! entirely isomorphic — it maps from one lingual domain to another with no loss of meaning and
13//! some loss of whitespace and capitalisation. Balkanoid works with both English and
14//! East-Slavic texts.
15
16pub mod dict;
17
18use crate::codecs::balkanoid::dict::WordResolveError;
19use crate::codecs::Codec;
20pub use dict::Dict;
21use std::borrow::Cow;
22
23/// The Balkanoid codec.
24pub struct Balkanoid<'a> {
25    dict: &'a Dict,
26}
27
28impl<'a> Balkanoid<'a> {
29    /// Creates a new Balkanoid codec that works off the given dictionary.
30    pub fn new(dict: &'a Dict) -> Self {
31        Self { dict }
32    }
33}
34
35impl Codec for Balkanoid<'_> {
36    type ExpandError = WordResolveError;
37
38    fn compress_line(&self, line: &str) -> String {
39        let mut buf = String::new();
40        // let words = Word::parse_line(line);
41        let words = line.split_whitespace();
42        // println!("words: {words:?}");
43        for (index, word) in words.enumerate() {
44            if index > 0 {
45                buf.push(' ');
46            }
47            let compressed_word = compress_word(self.dict, word);
48            for _ in 0..compressed_word.leading_spaces {
49                buf.push(' ');
50            }
51            buf.push_str(&compressed_word.body);
52        }
53        buf
54    }
55
56    fn expand_line(&self, line: &str) -> Result<String, Self::ExpandError> {
57        let mut buf = String::new();
58        let words = EncodedWord::parse_line(line);
59        // println!("words: {words:?}");
60        for (index, word) in words.into_iter().enumerate() {
61            if index > 0 {
62                buf.push(' ');
63            }
64            let expanded_word = expand_word(self.dict, word)?;
65            buf.push_str(&expanded_word);
66        }
67        Ok(buf)
68    }
69}
70
/// The vowel-free, lower-cased form of a word together with the capitalisation observed
/// while reducing it.
#[derive(Debug, PartialEq)]
struct Reduction {
    /// The word with every vowel removed and every capital lower-cased.
    fingerprint: String,
    /// Whether the first character of the word was uppercase.
    leading_capital: bool,
    /// Number of uppercase characters after the first one, capped at `u8::MAX`.
    trailing_capitals: u8,
}

impl Reduction {
    /// Assembles a reduction from its parts.
    fn new(fingerprint: String, leading_capital: bool, trailing_capitals: u8) -> Self {
        Reduction {
            fingerprint,
            leading_capital,
            trailing_capitals,
        }
    }

    /// Returns `true` when the reduced word contained no uppercase characters at all.
    fn is_lowercase(&self) -> bool {
        !self.leading_capital && self.trailing_capitals == 0
    }

    /// Returns `Some(self)` if the reduced word was entirely lowercase, `None` otherwise.
    fn take_if_lowercase(self) -> Option<Self> {
        if self.is_lowercase() {
            Some(self)
        } else {
            None
        }
    }
}

impl From<&str> for Reduction {
    /// Reduces `word` to its fingerprint, recording where capitals occurred.
    ///
    /// Uppercase vowels still count towards the capitalisation flags even though they do not
    /// appear in the fingerprint.
    fn from(word: &str) -> Self {
        let mut fingerprint = String::with_capacity(word.len());
        let mut leading_capital = false;
        let mut trailing_capitals: u8 = 0;
        for (position, ch) in word.chars().enumerate() {
            let uppercase = ch.is_uppercase();
            if uppercase {
                if position == 0 {
                    leading_capital = true;
                } else {
                    // Only "zero or non-zero" matters downstream, so saturate rather than
                    // overflow on words with more than 255 non-leading capitals.
                    trailing_capitals = trailing_capitals.saturating_add(1);
                }
            }

            if !is_vowel(ch) {
                let reduced = if uppercase {
                    // Only the first character of a multi-character lowercase mapping is kept.
                    ch.to_lowercase()
                        .next()
                        .expect("a lowercase mapping yields at least one character")
                } else {
                    ch
                };
                fingerprint.push(reduced);
            }
        }
        Reduction::new(fingerprint, leading_capital, trailing_capitals)
    }
}

/// Returns `true` if `ch` is one of the Latin (`a e i o u`) or Cyrillic
/// (`а э ы у я е ё ю и о`) vowels recognised by the codec, in either case.
fn is_vowel(ch: char) -> bool {
    matches!(
        ch,
        'a' | 'A'
            | 'e'
            | 'E'
            | 'i'
            | 'I'
            | 'o'
            | 'O'
            | 'u'
            | 'U'
            | 'а'
            | 'А'
            | 'э'
            | 'Э'
            | 'ы'
            | 'Ы'
            | 'у'
            | 'У'
            | 'я'
            | 'Я'
            | 'е'
            | 'Е'
            | 'ё'
            | 'Ё'
            | 'ю'
            | 'Ю'
            | 'и'
            | 'И'
            | 'о'
            | 'О'
    )
}
157
/// One token of a compressed line: a body plus the number of extra spaces written before it.
#[derive(Debug, PartialEq)]
struct EncodedWord {
    /// Count of space (or LRM) characters found directly ahead of the body, not counting the
    /// single separator that terminated the previous token. Capped at `u8::MAX`.
    leading_spaces: u8,
    /// The non-empty text of the token.
    body: String,
}

impl EncodedWord {
    /// Creates an encoded word.
    ///
    /// # Panics
    /// Panics if `body` is empty.
    fn new(leading_spaces: u8, body: String) -> Self {
        assert!(!body.is_empty());
        EncodedWord {
            leading_spaces,
            body,
        }
    }

    /// Splits a compressed line into encoded words.
    ///
    /// Both the ASCII space and the left-to-right mark (`U+200E`) act as separators. The first
    /// separator after a body terminates that token; separators seen while no body is pending
    /// are tallied into the next token's `leading_spaces`. Trailing separators are dropped.
    /// The tally saturates at `u8::MAX` instead of overflowing on very long runs.
    fn parse_line(line: &str) -> Vec<EncodedWord> {
        let mut words = Vec::new();
        let mut body = String::new();
        let mut leading_spaces: u8 = 0;

        for ch in line.chars() {
            if ch == ' ' || ch == '\u{200E}' {
                if body.is_empty() {
                    leading_spaces = leading_spaces.saturating_add(1);
                } else {
                    // Hand the finished body over and leave an empty buffer behind.
                    words.push(EncodedWord::new(leading_spaces, std::mem::take(&mut body)));
                    leading_spaces = 0;
                }
            } else {
                body.push(ch);
            }
        }

        if !body.is_empty() {
            words.push(EncodedWord::new(leading_spaces, body));
        }
        words
    }
}
204
/// A word split into its leading alphabetic run and whatever follows it.
#[derive(Debug, PartialEq)]
struct PunctuatedWord<'a> {
    /// The leading run of alphabetic characters, optionally starting with a backslash.
    prefix: Cow<'a, str>,
    /// Everything from the first splitting character onwards; empty if there is none.
    suffix: Cow<'a, str>,
}

impl<'a> From<&'a str> for PunctuatedWord<'a> {
    /// Splits `word` at its first non-alphabetic character.
    ///
    /// A backslash in the very first position is tolerated as part of the prefix, so that an
    /// escape marker stays attached to the word it escapes. Both halves borrow from `word`;
    /// nothing is allocated.
    fn from(word: &'a str) -> Self {
        let split_offset = word
            .char_indices()
            .find(|&(offset, ch)| {
                let leading_escape = offset == 0 && ch == '\\';
                !ch.is_alphabetic() && !leading_escape
            })
            .map_or(word.len(), |(offset, _)| offset);

        // `split_offset` came from `char_indices` (or is `word.len()`), so it is always on a
        // character boundary.
        let (prefix, suffix) = word.split_at(split_offset);
        PunctuatedWord {
            prefix: Cow::Borrowed(prefix),
            suffix: Cow::Borrowed(suffix),
        }
    }
}
237
/// Identifies which branch of `compress_word` produced an encoding (kept for diagnostics).
#[derive(Debug)]
enum CompactionRule {
    /// The dictionary contains the lower-cased word.
    InDict,
    /// The word is not in the dictionary and contains at least one vowel.
    NotInDictWithVowels,
    /// The word has no vowels and its fingerprint is absent from the dictionary.
    NoFingerprintInDict,
    /// The word has no vowels, but its fingerprint belongs to other dictionary words.
    Conflict,
    /// The word already starts with the escape character.
    LeadingEscape,
}
246
247fn compress_word(dict: &Dict, word: &str) -> EncodedWord {
248    assert!(!word.is_empty());
249    let punctuated = PunctuatedWord::from(word);
250
251    let (encoded_prefix, _) = {
252        let first_char = punctuated.prefix.chars().next();
253        match first_char {
254            Some('\\') => {
255                // the first character marks the start of an escape sequence
256                (
257                    (0, format!("\\{}", punctuated.prefix)),
258                    CompactionRule::LeadingEscape,
259                )
260            }
261            _ => {
262                // println!("punctuated: {punctuated:?}");
263                let prefix_reduction = Reduction::from(&punctuated.prefix as &str);
264                // println!("prefix reduction {prefix_reduction:?}");
265                let lowercase_prefix = punctuated.prefix.to_lowercase();
266                match dict.position(&prefix_reduction.fingerprint, &lowercase_prefix) {
267                    None => {
268                        if punctuated.prefix.len() != prefix_reduction.fingerprint.len() {
269                            // the input comprises one or more vowels
270                            (
271                                (0, punctuated.prefix.into_owned()),
272                                CompactionRule::NotInDictWithVowels,
273                            )
274                        } else if !dict.contains_fingerprint(&prefix_reduction.fingerprint) {
275                            // the input comprises only consonants and its fingerprint is not in the dict
276                            (
277                                (0, punctuated.prefix.into_owned()),
278                                CompactionRule::NoFingerprintInDict,
279                            )
280                        } else {
281                            // the input comprises only consonants and there are other words in the
282                            // dict with a matching fingerprint
283                            (
284                                (0, format!("\\{}", punctuated.prefix)),
285                                CompactionRule::Conflict,
286                            )
287                        }
288                    }
289                    Some(position) => {
290                        // the dictionary contains the lower-cased input
291                        let recapitalised_prefix = restore_capitalisation(
292                            prefix_reduction.fingerprint,
293                            prefix_reduction.leading_capital,
294                            prefix_reduction.trailing_capitals != 0,
295                        );
296                        ((position, recapitalised_prefix), CompactionRule::InDict)
297                    }
298                }
299            }
300        }
301    };
302    // println!("rule: {rule:?}");
303    EncodedWord::new(encoded_prefix.0, encoded_prefix.1 + &punctuated.suffix)
304}
305
/// Re-applies a capitalisation pattern to an all-lowercase word.
///
/// * `nonleading_capital` set: the whole word is upper-cased (this takes precedence).
/// * only `leading_capital` set: just the first character is upper-cased.
/// * neither set: the word is returned untouched.
///
/// An empty word is returned as an empty string in every case.
fn restore_capitalisation(
    lowercase_word: String,
    leading_capital: bool,
    nonleading_capital: bool,
) -> String {
    if nonleading_capital {
        return lowercase_word.to_uppercase();
    }
    if !leading_capital {
        return lowercase_word;
    }

    let mut chars = lowercase_word.chars();
    match chars.next() {
        // An uppercase mapping may expand to several characters; keep all of them.
        Some(first) => first.to_uppercase().chain(chars).collect(),
        // Nothing to capitalise; previously this case panicked.
        None => String::new(),
    }
}
320
/// The backslash byte that marks an escaped word in the encoded form.
const ESCAPE: u8 = b'\\';
322
323fn expand_word(dict: &Dict, word: EncodedWord) -> Result<String, WordResolveError> {
324    let punctuated = PunctuatedWord::from(word.body.as_str());
325    if punctuated.prefix.is_empty() {
326        return Ok(word.body);
327    }
328
329    let recapitalised_prefix = if punctuated.prefix.as_bytes()[0] == ESCAPE {
330        // word begins with an escape sequence
331        String::from(&punctuated.prefix[1..punctuated.prefix.len()])
332    } else {
333        let mut chars = punctuated.prefix.chars();
334        let leading_capital = chars.next().unwrap().is_uppercase();
335        let nonleading_capital = chars.next().map_or(false, char::is_uppercase);
336
337        if contains_vowels(&punctuated.prefix) {
338            // word encoded with vowels
339            punctuated.prefix.into_owned()
340        } else {
341            let lowercase_word = punctuated.prefix.to_lowercase();
342            match dict.resolve(&lowercase_word, word.leading_spaces)? {
343                None => {
344                    // the fingerprint is not in the dictionary
345                    punctuated.prefix.into_owned()
346                }
347                Some(resolved) => {
348                    // resolved a word from the dictionary
349                    restore_capitalisation(resolved.clone(), leading_capital, nonleading_capital)
350                }
351            }
352        }
353    };
354
355    Ok(recapitalised_prefix + &punctuated.suffix)
356}
357
358fn contains_vowels(text: &str) -> bool {
359    text.chars().any(is_vowel)
360}
361
362#[cfg(test)]
363mod tests;
serbzip_core/codecs/balkanoid.rs

serbzip_core/codecs/
balkanoid.rs