serbzip_core/codecs/
balkanoid.rs1pub mod dict;
17
18use crate::codecs::balkanoid::dict::WordResolveError;
19use crate::codecs::Codec;
20pub use dict::Dict;
21use std::borrow::Cow;
22
23pub struct Balkanoid<'a> {
25 dict: &'a Dict,
26}
27
28impl<'a> Balkanoid<'a> {
29 pub fn new(dict: &'a Dict) -> Self {
31 Self { dict }
32 }
33}
34
35impl Codec for Balkanoid<'_> {
36 type ExpandError = WordResolveError;
37
38 fn compress_line(&self, line: &str) -> String {
39 let mut buf = String::new();
40 let words = line.split_whitespace();
42 for (index, word) in words.enumerate() {
44 if index > 0 {
45 buf.push(' ');
46 }
47 let compressed_word = compress_word(self.dict, word);
48 for _ in 0..compressed_word.leading_spaces {
49 buf.push(' ');
50 }
51 buf.push_str(&compressed_word.body);
52 }
53 buf
54 }
55
56 fn expand_line(&self, line: &str) -> Result<String, Self::ExpandError> {
57 let mut buf = String::new();
58 let words = EncodedWord::parse_line(line);
59 for (index, word) in words.into_iter().enumerate() {
61 if index > 0 {
62 buf.push(' ');
63 }
64 let expanded_word = expand_word(self.dict, word)?;
65 buf.push_str(&expanded_word);
66 }
67 Ok(buf)
68 }
69}
70
/// Reduced form of a word together with a summary of its capitalisation.
#[derive(Debug, PartialEq)]
struct Reduction {
    /// The reduced text of the word.
    fingerprint: String,
    /// Whether the first character of the word was uppercase.
    leading_capital: bool,
    /// Number of uppercase characters found after the first position.
    trailing_capitals: u8,
}

impl Reduction {
    /// Assembles a reduction from its parts.
    fn new(fingerprint: String, leading_capital: bool, trailing_capitals: u8) -> Self {
        Self {
            fingerprint,
            leading_capital,
            trailing_capitals,
        }
    }

    /// Returns `true` when the word contained no uppercase characters at all.
    fn is_lowercase(&self) -> bool {
        !(self.leading_capital || self.trailing_capitals > 0)
    }

    /// Returns the reduction itself if it is lowercase, otherwise `None`.
    fn take_if_lowercase(self) -> Option<Self> {
        Some(self).filter(Self::is_lowercase)
    }
}
99
100impl From<&str> for Reduction {
101 fn from(word: &str) -> Self {
102 let mut fingerprint = String::new();
103 let mut leading_capital = false;
104 let mut trailing_capitals = 0;
105 for (position, ch) in word.chars().enumerate() {
106 if ch.is_uppercase() {
107 match position {
108 0 => leading_capital = true,
109 _ => trailing_capitals += 1,
110 }
111
112 if !is_vowel(ch) {
113 fingerprint.push(ch.to_lowercase().next().unwrap());
114 }
115 } else if !is_vowel(ch) {
116 fingerprint.push(ch);
117 }
118 }
119 Reduction::new(fingerprint, leading_capital, trailing_capitals)
120 }
121}
122
/// Returns `true` if `ch` is one of the Latin or Cyrillic vowels recognised
/// by the codec, in either case.
///
/// Only the characters listed below count; accented or otherwise related
/// letters are not treated as vowels.
fn is_vowel(ch: char) -> bool {
    const LATIN_VOWELS: &str = "aAeEiIoOuU";
    const CYRILLIC_VOWELS: &str = "аАэЭыЫуУяЯеЕёЁюЮиИоО";

    LATIN_VOWELS.contains(ch) || CYRILLIC_VOWELS.contains(ch)
}
157
/// A single token of a compressed line: a run of leading spaces followed by
/// a non-empty body.
#[derive(Debug, PartialEq)]
struct EncodedWord {
    /// Number of spaces written immediately before the body (in addition to
    /// the separator that follows the previous word).
    leading_spaces: u8,
    /// The encoded text of the word.
    body: String,
}

impl EncodedWord {
    /// Creates an encoded word.
    ///
    /// # Panics
    ///
    /// Panics if `body` is empty.
    fn new(leading_spaces: u8, body: String) -> Self {
        assert!(!body.is_empty());
        EncodedWord {
            leading_spaces,
            body,
        }
    }

    /// Splits a compressed line into its encoded words.
    ///
    /// Both the ASCII space and U+200E (left-to-right mark) act as spaces.
    /// A space that directly follows a body terminates that word; every
    /// further space seen before the next body is counted as a leading space
    /// of the next word. Spaces after the last body are discarded.
    ///
    /// The leading-space count saturates at `u8::MAX`. The line is external
    /// input, so a run of more than 255 spaces must neither panic (debug
    /// builds) nor silently wrap around to a small, wrong count (release
    /// builds).
    fn parse_line(line: &str) -> Vec<EncodedWord> {
        let mut words = Vec::new();
        let mut body = String::new();
        let mut leading_spaces: u8 = 0;

        for ch in line.chars() {
            let is_space = ch == ' ' || ch == '\u{200E}';
            if !is_space {
                body.push(ch);
            } else if body.is_empty() {
                leading_spaces = leading_spaces.saturating_add(1);
            } else {
                // Hand the finished body over and continue with a fresh buffer.
                words.push(EncodedWord {
                    leading_spaces,
                    body: std::mem::take(&mut body),
                });
                leading_spaces = 0;
            }
        }

        if !body.is_empty() {
            words.push(EncodedWord {
                leading_spaces,
                body,
            });
        }
        words
    }
}
204
/// A word split into an alphabetic prefix and whatever follows it.
#[derive(Debug, PartialEq)]
struct PunctuatedWord<'a> {
    /// Leading run of alphabetic characters; a backslash is additionally
    /// accepted as the very first character.
    prefix: Cow<'a, str>,
    /// Everything from the first character that ended the prefix onwards
    /// (empty when the whole word is prefix).
    suffix: Cow<'a, str>,
}

impl<'a> From<&'a str> for PunctuatedWord<'a> {
    /// Splits `word` at the first non-alphabetic character.
    ///
    /// A backslash in the first position does not end the prefix; anywhere
    /// else it does. Both halves borrow from `word`, so no allocation takes
    /// place.
    fn from(word: &'a str) -> Self {
        // Byte offset of the first character that terminates the prefix, or
        // the end of the word if there is none. Offset 0 is the first
        // character, which is the only place a backslash is tolerated.
        let split_offset = word
            .char_indices()
            .find(|&(offset, ch)| !ch.is_alphabetic() && !(offset == 0 && ch == '\\'))
            .map_or(word.len(), |(offset, _)| offset);

        // `split_offset` comes from `char_indices` (or is `word.len()`), so
        // it always lies on a character boundary.
        let (prefix, suffix) = word.split_at(split_offset);
        PunctuatedWord {
            prefix: Cow::Borrowed(prefix),
            suffix: Cow::Borrowed(suffix),
        }
    }
}
237
238#[derive(Debug)]
239enum CompactionRule {
240 InDict,
241 NotInDictWithVowels,
242 NoFingerprintInDict,
243 Conflict,
244 LeadingEscape,
245}
246
247fn compress_word(dict: &Dict, word: &str) -> EncodedWord {
248 assert!(!word.is_empty());
249 let punctuated = PunctuatedWord::from(word);
250
251 let (encoded_prefix, _) = {
252 let first_char = punctuated.prefix.chars().next();
253 match first_char {
254 Some('\\') => {
255 (
257 (0, format!("\\{}", punctuated.prefix)),
258 CompactionRule::LeadingEscape,
259 )
260 }
261 _ => {
262 let prefix_reduction = Reduction::from(&punctuated.prefix as &str);
264 let lowercase_prefix = punctuated.prefix.to_lowercase();
266 match dict.position(&prefix_reduction.fingerprint, &lowercase_prefix) {
267 None => {
268 if punctuated.prefix.len() != prefix_reduction.fingerprint.len() {
269 (
271 (0, punctuated.prefix.into_owned()),
272 CompactionRule::NotInDictWithVowels,
273 )
274 } else if !dict.contains_fingerprint(&prefix_reduction.fingerprint) {
275 (
277 (0, punctuated.prefix.into_owned()),
278 CompactionRule::NoFingerprintInDict,
279 )
280 } else {
281 (
284 (0, format!("\\{}", punctuated.prefix)),
285 CompactionRule::Conflict,
286 )
287 }
288 }
289 Some(position) => {
290 let recapitalised_prefix = restore_capitalisation(
292 prefix_reduction.fingerprint,
293 prefix_reduction.leading_capital,
294 prefix_reduction.trailing_capitals != 0,
295 );
296 ((position, recapitalised_prefix), CompactionRule::InDict)
297 }
298 }
299 }
300 }
301 };
302 EncodedWord::new(encoded_prefix.0, encoded_prefix.1 + &punctuated.suffix)
304}
305
/// Re-applies a capitalisation pattern to a lowercase word.
///
/// * `nonleading_capital` set: the whole word is uppercased (this takes
///   precedence over `leading_capital`).
/// * otherwise, `leading_capital` set: only the first character is
///   uppercased.
/// * otherwise the word is returned unchanged.
///
/// An empty word is returned as an empty string in every case rather than
/// panicking.
fn restore_capitalisation(
    lowercase_word: String,
    leading_capital: bool,
    nonleading_capital: bool,
) -> String {
    if nonleading_capital {
        return lowercase_word.to_uppercase();
    }
    if !leading_capital {
        return lowercase_word;
    }

    let mut chars = lowercase_word.chars();
    match chars.next() {
        Some(first) => {
            let mut capitalised = String::with_capacity(lowercase_word.len());
            // An uppercase mapping may expand to several characters.
            capitalised.extend(first.to_uppercase());
            capitalised.push_str(chars.as_str());
            capitalised
        }
        // Nothing to capitalise.
        None => String::new(),
    }
}
320
321const ESCAPE: u8 = b'\\';
322
323fn expand_word(dict: &Dict, word: EncodedWord) -> Result<String, WordResolveError> {
324 let punctuated = PunctuatedWord::from(word.body.as_str());
325 if punctuated.prefix.is_empty() {
326 return Ok(word.body);
327 }
328
329 let recapitalised_prefix = if punctuated.prefix.as_bytes()[0] == ESCAPE {
330 String::from(&punctuated.prefix[1..punctuated.prefix.len()])
332 } else {
333 let mut chars = punctuated.prefix.chars();
334 let leading_capital = chars.next().unwrap().is_uppercase();
335 let nonleading_capital = chars.next().map_or(false, char::is_uppercase);
336
337 if contains_vowels(&punctuated.prefix) {
338 punctuated.prefix.into_owned()
340 } else {
341 let lowercase_word = punctuated.prefix.to_lowercase();
342 match dict.resolve(&lowercase_word, word.leading_spaces)? {
343 None => {
344 punctuated.prefix.into_owned()
346 }
347 Some(resolved) => {
348 restore_capitalisation(resolved.clone(), leading_capital, nonleading_capital)
350 }
351 }
352 }
353 };
354
355 Ok(recapitalised_prefix + &punctuated.suffix)
356}
357
358fn contains_vowels(text: &str) -> bool {
359 text.chars().any(is_vowel)
360}
361
362#[cfg(test)]
363mod tests;