use std::sync::LazyLock;
use cached::SizedCache;
use cached::proc_macro::cached;
use decompound::{DecompositionOptions, decompound};
use itertools::Itertools;
use itertools::MinMaxResult::{MinMax, NoElements, OneElement};
use log::{debug, trace};
use unicode_titlecase::StrTitleCase;
use crate::actions::Action;
use crate::actions::german::machine::{StateMachine, Transition};
use crate::actions::german::words::{Replace, Replacement, WordCasing};
/// German language action.
///
/// Rewrites ASCII spellings of German special characters back to the real
/// thing, e.g. `Duebel` → `Dübel` or `schliessen` → `schließen`.
#[derive(Debug, Clone, Copy)]
pub struct German {
    /// When `true`, the untouched word is itself considered as a candidate,
    /// ahead of any candidate that has replacements applied.
    prefer_original: bool,
    /// When `true`, candidates are accepted without consulting the word list
    /// and only "replace nothing" / "replace everything" are considered.
    naive: bool,
}

impl German {
    /// Creates a new instance with both knobs set explicitly.
    ///
    /// See the builder-style methods on this type for what the flags mean.
    #[must_use]
    pub const fn new(prefer_original: bool, naive: bool) -> Self {
        Self {
            prefer_original,
            naive,
        }
    }

    /// Makes the action try the original word first, before any replacement.
    ///
    /// Returns `self` to allow chaining.
    // NOTE(review): the lint is silenced instead of making this `const`; the
    // reason is not visible in this file — confirm before removing the `allow`.
    #[allow(clippy::missing_const_for_fn)]
    pub fn prefer_original(&mut self) -> &mut Self {
        *self = Self {
            prefer_original: true,
            ..*self
        };
        self
    }

    /// Makes the action skip the original word and only consider candidates
    /// that had replacements applied.
    ///
    /// Returns `self` to allow chaining.
    #[allow(clippy::missing_const_for_fn)]
    pub fn prefer_replacement(&mut self) -> &mut Self {
        *self = Self {
            prefer_original: false,
            ..*self
        };
        self
    }

    /// Switches to naive mode: candidates are accepted without validation.
    ///
    /// Returns `self` to allow chaining.
    #[allow(clippy::missing_const_for_fn)]
    pub fn naive(&mut self) -> &mut Self {
        *self = Self {
            naive: true,
            ..*self
        };
        self
    }

    /// Switches to sophisticated mode: candidates are validated before being
    /// accepted.
    ///
    /// Returns `self` to allow chaining.
    #[allow(clippy::missing_const_for_fn)]
    pub fn sophisticated(&mut self) -> &mut Self {
        *self = Self {
            naive: false,
            ..*self
        };
        self
    }
}
impl Default for German {
    /// The default configuration: replacements are preferred over the original
    /// word, and candidates are validated (non-naive).
    fn default() -> Self {
        Self {
            prefer_original: false,
            naive: false,
        }
    }
}
impl Action for German {
    /// Runs the German substitution over `input` and returns the result.
    ///
    /// Every character is fed through a [`StateMachine`]. Characters the
    /// machine reports as external are copied verbatim. Characters it consumes
    /// (entering or staying inside) are withheld. Once the machine exits, the
    /// word it collected is swapped for the result of
    /// `find_valid_replacement` (or kept as is if that yields nothing), and
    /// the character that caused the exit is emitted after it.
    fn act(&self, input: &str) -> String {
        // Sentinel fed in after the real input, so that the last real
        // character is always followed by one more transition. It is popped
        // off the output again before returning.
        const INDICATOR: char = '\0';

        let mut machine = StateMachine::new();
        let mut output = String::with_capacity(input.len());

        let terminated_input = input.chars().chain(std::iter::once(INDICATOR));
        for char in terminated_input {
            trace!(
                "Beginning processing of character '{}'",
                char.escape_debug()
            );

            let transition = machine.transition(char);
            trace!("Transition is '{transition:?}'");

            match transition {
                // Nothing to emit yet; the machine is holding on to the word.
                Transition::Entered | Transition::Internal => {}
                // Not part of any word of interest: pass through untouched.
                Transition::External => output.push(char),
                Transition::Exited => {
                    debug!("Exited machine: {machine:?}");

                    let original = machine.current_word().content().to_owned();
                    let candidate = find_valid_replacement(
                        &original,
                        machine.current_word().replacements(),
                        self.prefer_original,
                        self.naive,
                    );
                    // No acceptable candidate: fall back to the word as it was.
                    let processed = candidate.unwrap_or(original);

                    debug!(
                        "Processed word, appending to output: {:?}",
                        &processed
                    );
                    output.push_str(&processed);
                    // The character that ended the word still belongs to the output.
                    output.push(char);
                }
            }
        }

        // Drop the sentinel that was appended above.
        let c = output.pop();
        debug_assert!(
            c == Some(INDICATOR),
            "Trailing indicator byte expected, but found '{c:?}'."
        );

        debug!("Final output string is '{}'", output.escape_debug());
        output
    }
}
/// Searches for a version of `word` that has some of `replacements` applied
/// and is considered valid.
///
/// Candidates are generated from subsets of `replacements` and tried in order;
/// the first acceptable one is returned.
///
/// - `prefer_original`: if `true`, the empty subset (i.e. `word` unchanged) is
///   tried first; if `false`, it is skipped entirely.
/// - `naive`: if `true`, only the empty subset and the complete set are
///   considered, and the first candidate tried is accepted without validation.
///   If `false`, every subset is considered and each candidate has to pass
///   `is_valid` against the global word list.
///
/// Returns `None` if no candidate was accepted.
fn find_valid_replacement(
    word: &str,
    replacements: &[Replacement],
    prefer_original: bool,
    naive: bool,
) -> Option<String> {
    let replacement_combinations: Vec<Vec<Replacement>> = if naive {
        // Naive mode only ever looks at the shortest and longest member of the
        // powerset. Those are always the empty set and the complete set, so
        // build them directly instead of materializing all 2^n subsets first.
        let mut extremes = vec![Vec::new()];
        if !replacements.is_empty() {
            extremes.push(replacements.to_vec());
        }
        extremes
    } else {
        // Ordered by subset length, so the empty set always comes first.
        replacements
            .iter()
            .powerset()
            .map(|subset| subset.into_iter().cloned().collect())
            .collect()
    };

    debug!("Starting search for valid replacement for word '{word}'");
    trace!("All replacement combinations to try: {replacement_combinations:?}");

    // The skipping below relies on the empty set (original word) being first.
    debug_assert!(replacement_combinations.first().is_none_or(Vec::is_empty));

    // Skip the leading empty set unless the original word is a wanted candidate.
    let n_skip = usize::from(!prefer_original);

    for replacements in replacement_combinations.into_iter().skip(n_skip) {
        let mut candidate = word.to_owned();
        candidate.apply_replacements(replacements);
        trace!("Replaced candidate word, now is: '{candidate}'. Starting validity check.");

        if naive || is_valid(&candidate, &contained_in_global_word_list) {
            debug!("Candidate '{candidate}' is valid, returning early");
            return Some(candidate);
        }

        trace!("Candidate '{candidate}' is invalid, trying next one");
    }

    debug!("No valid replacement found, returning");
    None
}
/// The German word list as a finite state transducer set.
///
/// The raw bytes are embedded into the binary at compile time from
/// `$OUT_DIR/de.fst`; the set itself is constructed on first access.
///
/// # Panics
///
/// First access panics if the embedded bytes are not a valid FST.
static SET: LazyLock<fst::Set<&[u8]>> = LazyLock::new(|| {
    let embedded: &'static [u8] = include_bytes!(concat!(env!("OUT_DIR"), "/de.fst"));

    trace!("Loading FST.");
    let word_list = fst::Set::new(embedded)
        .expect("Failed to load FST; FST bytes malformed at build time?");
    trace!("Done loading FST.");

    word_list
});
fn contained_in_global_word_list(word: &str) -> bool {
trace!("Performing lookup of '{word}' in FST.");
let result = SET.contains(word);
trace!("Done performing word lookup in FST (got '{result}').");
result
}
/// Decides whether `word` counts as a valid word according to `predicate`.
///
/// What gets looked up depends on the casing of `word`:
///
/// - all lowercase: `word` itself, else a decomposition into known parts;
/// - all uppercase: the titlecased form is validated instead;
/// - mixed: the titlecased form if `word` starts with an uppercase character,
///   the lowercased form otherwise;
/// - titlecase: `word` itself, else its lowercased form, else a decomposition
///   into known parts;
/// - casing cannot be determined: invalid.
///
/// Results are memoized in a cache bounded to 256 entries. The cache key is
/// `word` alone; `predicate` is not part of it, so all callers are expected to
/// pass an equivalent predicate.
#[cached(
    ty = "SizedCache<String, bool>",
    create = "{ SizedCache::with_size(256) }",
    convert = r#"{ String::from(word) }"#
)]
fn is_valid(word: &str, predicate: &impl Fn(&str) -> bool) -> bool {
    trace!("Trying candidate '{word}'");

    let casing = WordCasing::try_from(word);
    trace!("Casing of candidate is '{casing:?}'");

    // Shared by the all-uppercase and the uppercase-leading mixed case:
    // normalize to titlecase and validate that form instead.
    let titlecased_is_valid = || {
        let tc = word.to_titlecase_lower_rest();
        debug_assert!(
            WordCasing::try_from(tc.as_str()) == Ok(WordCasing::Titlecase),
            "Titlecased word, but isn't categorized correctly."
        );
        is_valid(&tc, predicate)
    };

    match casing {
        Err(_) => false,
        Ok(WordCasing::AllUppercase) => titlecased_is_valid(),
        Ok(WordCasing::Mixed) => {
            let starts_uppercase = word.chars().next().is_some_and(char::is_uppercase);
            if starts_uppercase {
                titlecased_is_valid()
            } else {
                is_valid(&word.to_lowercase(), predicate)
            }
        }
        Ok(WordCasing::AllLowercase) => {
            predicate(word)
                || decompound(word, predicate, DecompositionOptions::TRY_TITLECASE_SUFFIX).is_ok()
        }
        Ok(WordCasing::Titlecase) => {
            // Cheapest check first; the lowercased form covers words that are
            // only capitalized by position.
            predicate(word)
                || is_valid(&word.to_lowercase(), predicate)
                || decompound(word, predicate, DecompositionOptions::TRY_TITLECASE_SUFFIX).is_ok()
        }
    }
}
// Unit tests for the German action. Most expectations depend on the contents
// of the word list embedded in `SET`.
#[cfg(test)]
mod tests {
use rstest::rstest;
use super::*;
// Sanity check on the embedded word list: passes as soon as a single
// pure-ASCII key is found while streaming through the FST.
#[test]
fn test_word_list_is_not_filtered() {
let mut stream = SET.stream();
assert!(
{
let mut has_any_ascii = false;
while let Some(key) = fst::Streamer::next(&mut stream) {
if key.is_ascii() {
has_any_ascii = true;
// One hit is enough; no need to walk the rest of the list.
break;
}
}
has_any_ascii
},
concat!(
"Looks like you're using a filtered word list containing only special characters.",
" The current implementation relies on the full word list (also containing all non-Umlaut words)"
)
);
}
// The empty string must be rejected rather than cause a panic.
#[test]
fn test_is_valid_on_empty_input() {
assert!(!is_valid("", &contained_in_global_word_list));
}
// Table of words and whether `is_valid` accepts them against the global
// word list.
#[rstest]
#[case("Koeffizient", true)]
#[case("kongruent", true)]
#[case("Mauer", true)]
#[case("dröge", true)]
#[case("Kübel", true)]
// Mixed casing: a leading uppercase character makes `is_valid` check the
// titlecased form, otherwise the lowercased form is checked.
#[case("DüBeL", true)] #[case("düBeL", false)] #[case("dröGE", true)] #[case("DrÖgE", true)] #[case("????", false)]
// Input containing NUL bytes, emoji or other non-letter content is rejected.
#[case("\0", false)]
#[case("\0Dübel", false)]
#[case("Dübel\0", false)]
#[case("\0Dübel\0", false)]
#[case("🤩Dübel", false)]
#[case("🤩Dübel🤐", false)]
#[case("😎", false)]
#[case("Mauer😂", false)]
// Spellings that are wrong as they stand (missing or misplaced Umlaut).
#[case("Duebel", false)]
#[case("Maür", false)]
#[case("Maürdübelkübel", false)]
#[case("Messgerät", true)]
#[case("messgerät", false)]
// Compound words.
#[case("Mauerdübel", true)]
#[case("Mauerdübelkübel", true)]
#[case("Süßwasserschwimmbäder", true)]
#[case("kindergarten", false)]
#[case("Kindergarten", true)] #[case("No\nway", false)]
// Non-Latin scripts.
#[case("مرحبا", false)]
#[case("你好", false)]
fn test_is_valid(#[case] word: &str, #[case] expected: bool) {
assert_eq!(is_valid(word, &contained_in_global_word_list), expected);
}
// End-to-end substitution with the default configuration (replacement
// preferred, candidates validated).
#[rstest]
#[case("Dübel", "Dübel")]
#[case("\0Kuebel", "\0Kübel")]
#[case("\0Duebel\0", "\0Dübel\0")]
#[case("🤩Duebel", "🤩Dübel")]
#[case("🤩Duebel🤐", "🤩Dübel🤐")]
#[case("Abenteuer sind toll!", "Abenteuer sind toll!")]
#[case("Koeffizient", "Koeffizient")]
#[case("kongruent", "kongruent")]
#[case(
"Ich mag Aepfel, aber nicht Aerger.",
"Ich mag Äpfel, aber nicht Ärger."
)]
#[case("Ich mag AEPFEL!! 😍", "Ich mag ÄPFEL!! 😍")]
#[case("Wer mag Aepfel?!", "Wer mag Äpfel?!")]
// A lowercase `aepfel` is left alone, while uppercase and mixed-case input
// is still substituted, keeping the casing of the input.
#[case("Was sind aepfel?", "Was sind aepfel?")] #[case("WARUM SCHLIESSEN WIR NICHT AB?", "WARUM SCHLIEẞEN WIR NICHT AB?")]
#[case("Wir schliessen nicht ab.", "Wir schließen nicht ab.")]
#[case("WiR sChLieSsEn ab!", "WiR sChLieẞEn ab!")]
#[case("WiR sChLiesSEn vieLleEcHt aB.", "WiR sChLießEn vieLleEcHt aB.")]
#[case("Suess!", "Süß!")]
#[case(
"Oel ist ein wichtiger Bestandteil von Oel.",
"Öl ist ein wichtiger Bestandteil von Öl."
)]
#[case(
"\0Schoener 你好 Satz... 👋🏻\r\n\n",
"\0Schöner 你好 Satz... 👋🏻\r\n\n"
)]
fn test_substitution(#[case] input: &str, #[case] expected: &str) {
let action = German::default();
let result = action.act(input);
assert_eq!(result, expected);
}
// Naive mode applies every replacement without validating the outcome, so
// even nonsense words are rewritten. These cases pin the casing of the
// substituted characters.
#[rstest]
#[case("ue", "ü")]
#[case("uE", "ü")]
#[case("Ue", "Ü")]
#[case("UE", "Ü")]
#[case("uekol", "ükol")]
#[case("uEkol", "ükol")]
#[case("Uekol", "Ükol")]
#[case("UEkol", "Ükol")]
#[case("guessa", "güßa")]
#[case("gUessa", "gÜßa")]
#[case("guEssa", "güßa")]
#[case("gUEssa", "gÜßa")]
#[case("Guessa", "Güßa")]
#[case("GUESSA", "GÜẞA")]
fn test_casing_when_being_naive(#[case] input: &str, #[case] expected: &str) {
let mut action = German::default();
action.naive();
let result = action.act(input);
assert_eq!(result, expected);
}
}