/// Entry point of the build script.
///
/// With the `german` feature enabled this generates the word lists; without
/// it the body is compiled out and the function does nothing.
fn main() {
    #[cfg(feature = "german")]
    {
        natural_languages::generate_word_lists();
    }
}
#[cfg(feature = "german")]
mod natural_languages {
use std::io::{BufReader, BufWriter};
use std::{
env,
fs::{self, File},
path::Path,
};
/// Generates the compiled word lists consumed by the rest of the crate.
///
/// Reads `data/word-lists/de.txt`, hands it to [`german::process_german`] and
/// writes the result to `de.fst` inside the directory named by the `OUT_DIR`
/// environment variable. Afterwards a `cargo:rerun-if-changed` directive is
/// printed for the source file.
///
/// # Panics
///
/// Panics if `OUT_DIR` is not set, if the destination directory cannot be
/// created, if either file cannot be opened/created, or if the output cannot
/// be flushed to disk.
pub fn generate_word_lists() {
    let base_source_path = Path::new("data/word-lists");
    let out_dir = env::var_os("OUT_DIR")
        .expect("OUT_DIR is not set; this code must run as part of a Cargo build script");
    let base_destination_path = Path::new(&out_dir);

    {
        let source_file = base_source_path.join("de.txt");
        let destination_file = base_destination_path.join("de.fst");

        // Report a failure to create the directory instead of discarding it.
        if let Some(parent) = destination_file.parent() {
            fs::create_dir_all(parent).unwrap_or_else(|err| {
                panic!("failed to create directory {}: {}", parent.display(), err)
            });
        }

        let source = File::open(&source_file).unwrap_or_else(|err| {
            panic!("failed to open word list {}: {}", source_file.display(), err)
        });
        let destination = File::create(&destination_file).unwrap_or_else(|err| {
            panic!("failed to create {}: {}", destination_file.display(), err)
        });

        let mut source = BufReader::new(source);
        let mut destination = BufWriter::new(destination);
        german::process_german(&mut source, &mut destination);

        // `BufWriter`'s `Drop` ignores write errors, so flush explicitly to
        // surface them.
        std::io::Write::flush(&mut destination).unwrap_or_else(|err| {
            panic!("failed to flush {}: {}", destination_file.display(), err)
        });

        println!("cargo:rerun-if-changed={}", source_file.display());
    }
}
#[cfg(feature = "german")]
mod german {
use decompound::{decompound, DecompositionOptions};
use rayon::prelude::*;
use std::collections::HashSet;
use std::env;
use std::io::{BufReader, BufWriter, Read, Write};
use std::sync::Mutex;
/// Times the evaluation of an expression and reports it on stdout.
///
/// `time_it!(label, expr)` evaluates `expr` exactly once, prints
/// `"<label> - Time taken: <duration>"` and then yields the value of `expr`.
macro_rules! time_it {
    ($label:expr, $body:expr) => {{
        let started = ::std::time::Instant::now();
        let value = $body;
        // Capture the elapsed time before printing so that formatting the
        // label is not part of the measurement.
        let elapsed = started.elapsed();
        println!("{} - Time taken: {:?}", $label, elapsed);
        value
    }};
}
/// Turns a newline-separated word list into an FST set written to
/// `destination`.
///
/// Every line of `source` is trimmed and collected into a set of words. Each
/// word is then passed to `decompound`, with membership in that same set as
/// the validity check. Words for which `decompound` returns `Err` are kept;
/// all others are counted as dropped compound words. The kept words are
/// sorted and inserted into an `fst::SetBuilder` over `destination`.
///
/// A `cargo:warning` line with the dropped/remaining counts is printed, along
/// with timing information for the individual phases.
///
/// # Panics
///
/// Panics if `source` cannot be read, if `OUT_DIR` is not set, or if building
/// or writing the FST fails.
pub fn process_german<R, W>(source: &mut BufReader<R>, destination: &mut BufWriter<W>)
where
    R: Read,
    W: Write,
{
    let mut contents = String::new();
    source
        .read_to_string(&mut contents)
        .expect("failed to read the word list");

    let words: HashSet<&str> = time_it!(
        "Constructing hashset of words",
        contents
            .lines()
            .map(|word| word.trim())
            // Skip blank lines so the empty string never becomes a "word".
            .filter(|word| !word.is_empty())
            .collect()
    );

    let keepers = Mutex::new(Vec::new());
    time_it!(
        "Filtering words",
        words.par_iter().for_each(|word| {
            let decomposition = decompound(
                word,
                &|w| words.contains(w),
                DecompositionOptions::TRY_TITLECASE_SUFFIX,
            );

            // Only words that could not be decomposed are kept.
            if decomposition.is_err() {
                keepers
                    .lock()
                    .expect("a worker panicked while holding the keepers lock")
                    .push(*word);
            }
        })
    );
    let mut keepers = keepers
        .into_inner()
        .expect("a worker panicked while holding the keepers lock");

    // `keepers` is a duplicate-free subset of `words` (each set element is
    // visited once), so the number of dropped words is the size difference.
    let dropped_count = words.len() - keepers.len();
    drop(words);

    // Path the warning points readers to: the `output` entry next to OUT_DIR.
    // NOTE(review): nothing in this function prints the individual dropped
    // words; confirm that file really contains the list the message promises.
    let build_output_path = {
        let mut path: std::path::PathBuf = env::var_os("OUT_DIR")
            .expect("OUT_DIR is not set; this code must run as part of a Cargo build script")
            .into();
        path.pop();
        path.push("output");
        path
    };
    // `display()` rather than `{:?}`: the Debug form adds its own quotes (and
    // escapes backslashes), which would be nested inside the literal quotes.
    println!(
        "cargo:warning=Dropped {} compound words ({} remaining); see '{}' for a list.",
        dropped_count,
        keepers.len(),
        build_output_path.display(),
    );

    // The FST builder needs keys in lexicographic order. The items are unique
    // already, so an unstable sort is enough and no dedup pass is required.
    time_it!("Sorting filtered words", keepers.sort_unstable());

    time_it!("Building FST", {
        let mut build =
            fst::SetBuilder::new(destination).expect("failed to start writing the FST");
        for word in &keepers {
            build
                .insert(word)
                .expect("failed to insert a word into the FST");
        }
        build.finish().expect("failed to finish writing the FST");
    });
}
}
}