pub mod dict;
use crate::codecs::balkanoid::dict::WordResolveError;
use crate::codecs::Codec;
pub use dict::Dict;
use std::borrow::Cow;
pub struct Balkanoid<'a> {
dict: &'a Dict,
}
impl<'a> Balkanoid<'a> {
pub fn new(dict: &'a Dict) -> Self {
Self { dict }
}
}
impl Codec for Balkanoid<'_> {
type ExpandError = WordResolveError;
fn compress_line(&self, line: &str) -> String {
let mut buf = String::new();
let words = line.split_whitespace();
for (index, word) in words.enumerate() {
if index > 0 {
buf.push(' ');
}
let compressed_word = compress_word(self.dict, word);
for _ in 0..compressed_word.leading_spaces {
buf.push(' ');
}
buf.push_str(&compressed_word.body);
}
buf
}
fn expand_line(&self, line: &str) -> Result<String, Self::ExpandError> {
let mut buf = String::new();
let words = EncodedWord::parse_line(line);
for (index, word) in words.into_iter().enumerate() {
if index > 0 {
buf.push(' ');
}
let expanded_word = expand_word(self.dict, word)?;
buf.push_str(&expanded_word);
}
Ok(buf)
}
}
#[derive(Debug, PartialEq)]
struct Reduction {
fingerprint: String,
leading_capital: bool,
trailing_capitals: u8,
}
impl Reduction {
fn new(fingerprint: String, leading_capital: bool, trailing_capitals: u8) -> Self {
Reduction {
fingerprint,
leading_capital,
trailing_capitals,
}
}
fn is_lowercase(&self) -> bool {
!self.leading_capital && self.trailing_capitals == 0
}
fn take_if_lowercase(self) -> Option<Self> {
if self.is_lowercase() {
Some(self)
} else {
None
}
}
}
impl From<&str> for Reduction {
fn from(word: &str) -> Self {
let mut fingerprint = String::new();
let mut leading_capital = false;
let mut trailing_capitals = 0;
for (position, ch) in word.chars().enumerate() {
if ch.is_uppercase() {
match position {
0 => leading_capital = true,
_ => trailing_capitals += 1,
}
if !is_vowel(ch) {
fingerprint.push(ch.to_lowercase().next().unwrap());
}
} else if !is_vowel(ch) {
fingerprint.push(ch);
}
}
Reduction::new(fingerprint, leading_capital, trailing_capitals)
}
}
fn is_vowel(ch: char) -> bool {
matches!(
ch,
'a' | 'A'
| 'e'
| 'E'
| 'i'
| 'I'
| 'o'
| 'O'
| 'u'
| 'U'
| 'а'
| 'А'
| 'э'
| 'Э'
| 'ы'
| 'Ы'
| 'у'
| 'У'
| 'я'
| 'Я'
| 'е'
| 'Е'
| 'ё'
| 'Ё'
| 'ю'
| 'Ю'
| 'и'
| 'И'
| 'о'
| 'О'
)
}
#[derive(Debug, PartialEq)]
struct EncodedWord {
leading_spaces: u8,
body: String,
}
impl EncodedWord {
fn new(leading_spaces: u8, body: String) -> Self {
assert!(!body.is_empty());
EncodedWord {
leading_spaces,
body,
}
}
fn parse_line(line: &str) -> Vec<EncodedWord> {
let mut buf = Some(String::new());
let mut leading_spaces: u8 = 0;
let chars = line.chars();
let mut words = Vec::new();
for ch in chars {
if ch == ' ' || ch == '\u{200E}' {
if buf.as_ref().unwrap().is_empty() {
leading_spaces += 1;
} else {
words.push(EncodedWord {
leading_spaces,
body: buf.replace(String::new()).unwrap(),
});
leading_spaces = 0;
}
} else {
buf.as_mut().unwrap().push(ch);
}
}
if !buf.as_ref().unwrap().is_empty() {
words.push(EncodedWord {
leading_spaces,
body: buf.take().unwrap(),
});
}
words
}
}
#[derive(Debug, PartialEq)]
struct PunctuatedWord<'a> {
prefix: Cow<'a, str>,
suffix: Cow<'a, str>,
}
impl <'a> From<&'a str> for PunctuatedWord<'a> {
fn from(word: &'a str) -> Self {
let position = word.chars().enumerate().position(|(position, ch)| {
match position {
0 => !ch.is_alphabetic() && ch != '\\', _ => !ch.is_alphabetic(), }
});
match position {
None => PunctuatedWord {
prefix: Cow::Borrowed(word),
suffix: Cow::Borrowed(""),
},
Some(position) => {
let prefix = word.chars().take(position).collect::<String>();
let suffix = word.chars().skip(position).collect::<String>();
PunctuatedWord {
prefix: Cow::Owned(prefix),
suffix: Cow::Owned(suffix),
}
}
}
}
}
#[derive(Debug)]
enum CompactionRule {
InDict,
NotInDictWithVowels,
NoFingerprintInDict,
Conflict,
LeadingEscape,
}
fn compress_word(dict: &Dict, word: &str) -> EncodedWord {
assert!(!word.is_empty());
let punctuated = PunctuatedWord::from(word);
let (encoded_prefix, _) = {
let first_char = punctuated.prefix.chars().next();
match first_char {
Some('\\') => {
(
(0, format!("\\{}", punctuated.prefix)),
CompactionRule::LeadingEscape,
)
}
_ => {
let prefix_reduction = Reduction::from(&punctuated.prefix as &str);
let lowercase_prefix = punctuated.prefix.to_lowercase();
match dict.position(&prefix_reduction.fingerprint, &lowercase_prefix) {
None => {
if punctuated.prefix.len() != prefix_reduction.fingerprint.len() {
(
(0, punctuated.prefix.into_owned()),
CompactionRule::NotInDictWithVowels,
)
} else if !dict.contains_fingerprint(&prefix_reduction.fingerprint) {
(
(0, punctuated.prefix.into_owned()),
CompactionRule::NoFingerprintInDict,
)
} else {
(
(0, format!("\\{}", punctuated.prefix)),
CompactionRule::Conflict,
)
}
}
Some(position) => {
let recapitalised_prefix = restore_capitalisation(
prefix_reduction.fingerprint,
prefix_reduction.leading_capital,
prefix_reduction.trailing_capitals != 0,
);
((position, recapitalised_prefix), CompactionRule::InDict)
}
}
}
}
};
EncodedWord::new(encoded_prefix.0, encoded_prefix.1 + &punctuated.suffix)
}
fn restore_capitalisation(
lowercase_word: String,
leading_capital: bool,
nonleading_capital: bool,
) -> String {
if nonleading_capital {
lowercase_word.to_uppercase()
} else if leading_capital {
let mut chars = lowercase_word.chars();
chars.next().unwrap().to_uppercase().to_string() + chars.as_str()
} else {
lowercase_word
}
}
const ESCAPE: u8 = b'\\';
fn expand_word(dict: &Dict, word: EncodedWord) -> Result<String, WordResolveError> {
let punctuated = PunctuatedWord::from(word.body.as_str());
if punctuated.prefix.is_empty() {
return Ok(word.body);
}
let recapitalised_prefix = if punctuated.prefix.as_bytes()[0] == ESCAPE {
String::from(&punctuated.prefix[1..punctuated.prefix.len()])
} else {
let mut chars = punctuated.prefix.chars();
let leading_capital = chars.next().unwrap().is_uppercase();
let nonleading_capital = chars.next().map_or(false, char::is_uppercase);
if contains_vowels(&punctuated.prefix) {
punctuated.prefix.into_owned()
} else {
let lowercase_word = punctuated.prefix.to_lowercase();
match dict.resolve(&lowercase_word, word.leading_spaces)? {
None => {
punctuated.prefix.into_owned()
}
Some(resolved) => {
restore_capitalisation(resolved.clone(), leading_capital, nonleading_capital)
}
}
}
};
Ok(recapitalised_prefix + &punctuated.suffix)
}
fn contains_vowels(text: &str) -> bool {
text.chars().any(is_vowel)
}
#[cfg(test)]
mod tests;