use std::collections::BTreeMap;
use crate::conlang::types::morphology::Morphology;
use crate::conlang::Phonology;
use crate::language_entry::DictionaryEntry;
/// Raw text of the bundled English sentence pool, embedded at compile time via
/// `include_str!` from `assets/conlang/english-pool-v1.txt`.
///
/// Pass it to [`parse_pool`] to obtain the individual sentences.
pub const BUNDLED_POOL: &str = include_str!("../../../assets/conlang/english-pool-v1.txt");
/// A pool sentence that was not accepted into the corpus.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Rejected {
    /// The English sentence as it was submitted for translation.
    pub english: String,
    /// Words the translator reported as unresolved. May be empty: [`generate`]
    /// also rejects a sentence whose translated text is blank.
    pub unresolved: Vec<String>,
}

/// Result of running a sentence pool through [`generate`].
#[derive(Debug, Clone, Default)]
pub struct CorpusReport {
    /// Number of non-blank sentences that were sent to the translator.
    pub scanned: usize,
    /// `(english, translated)` pairs for every accepted sentence.
    pub accepted: Vec<(String, String)>,
    /// Sentences that were not accepted, with their unresolved words.
    pub rejected: Vec<Rejected>,
}

impl CorpusReport {
    /// Returns `accepted.len() / scanned` as an `f32`.
    ///
    /// Yields `0.0` when nothing was scanned, so the division never sees a
    /// zero denominator.
    pub fn acceptance_rate(&self) -> f32 {
        match self.scanned {
            0 => 0.0,
            scanned => self.accepted.len() as f32 / scanned as f32,
        }
    }

    /// Returns at most `n` unresolved words, ranked by how often they occur
    /// across all rejected sentences.
    ///
    /// Every occurrence counts, including repeats inside a single sentence.
    /// Words are ordered by descending count; ties are broken by ascending
    /// string order, so the result is deterministic.
    pub fn top_missing(&self, n: usize) -> Vec<(String, usize)> {
        // Count over borrowed words: no allocation is needed just to tally.
        let mut counts: BTreeMap<&str, usize> = BTreeMap::new();
        for word in self.rejected.iter().flat_map(|r| r.unresolved.iter()) {
            *counts.entry(word.as_str()).or_default() += 1;
        }

        let mut ranked: Vec<(&str, usize)> = counts.into_iter().collect();
        // Map keys are unique, so (count, word) is a total order and an
        // unstable sort gives the same result as a stable one. `then_with`
        // only compares the words when the counts tie.
        ranked.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));

        // Only the entries actually returned are turned into owned strings.
        ranked
            .into_iter()
            .take(n)
            .map(|(word, count)| (word.to_owned(), count))
            .collect()
    }
}
/// Splits pool text into sentences, one per line.
///
/// Each line is trimmed of surrounding whitespace. Lines that are empty after
/// trimming, or that begin with `#`, are skipped. The surviving lines are
/// returned as owned strings in their original order.
pub fn parse_pool(text: &str) -> Vec<String> {
    let mut sentences = Vec::new();
    for raw in text.lines() {
        let line = raw.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        sentences.push(line.to_string());
    }
    sentences
}
/// Translates every sentence in `pool` and sorts the outcomes into a
/// [`CorpusReport`].
///
/// Each pool entry is trimmed; entries that are blank after trimming are
/// skipped and not counted. Every remaining sentence increments `scanned` and
/// is passed to `super::translate` together with `phon`, `morph`, `typology`
/// and `entries`.
///
/// A sentence is accepted only when the translation reports no unresolved
/// words and its target text is not blank. Otherwise it is recorded as
/// [`Rejected`] along with the unresolved words.
pub fn generate(
    phon: &Phonology,
    morph: &Morphology,
    typology: &BTreeMap<String, String>,
    entries: &[DictionaryEntry],
    pool: &[String],
) -> CorpusReport {
    let mut report = CorpusReport::default();
    let sentences = pool.iter().map(|raw| raw.trim()).filter(|s| !s.is_empty());
    for sentence in sentences {
        report.scanned += 1;
        let translation = super::translate(phon, morph, typology, entries, sentence);
        let incomplete =
            !translation.unresolved.is_empty() || translation.target.trim().is_empty();
        if incomplete {
            report.rejected.push(Rejected {
                english: sentence.to_string(),
                unresolved: translation.unresolved,
            });
        } else {
            report
                .accepted
                .push((sentence.to_string(), translation.target));
        }
    }
    report
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a dictionary entry from a word, part of speech and translation;
    /// all remaining fields take their `Default` values.
    fn entry(word: &str, pos: &str, translation: &str) -> DictionaryEntry {
        DictionaryEntry {
            word: word.into(),
            pos: pos.into(),
            translation: translation.into(),
            ..Default::default()
        }
    }

    /// The bundled pool parses to more than 50 sentences, none of them blank
    /// or comment lines.
    #[test]
    fn bundled_pool_parses() {
        let sentences = parse_pool(BUNDLED_POOL);
        let count = sentences.len();
        assert!(count > 50, "pool should have many sentences, got {}", count);
        let any_bad = sentences
            .iter()
            .any(|s| s.starts_with('#') || s.is_empty());
        assert!(!any_bad);
    }

    /// A sentence fully covered by the dictionary is accepted; one containing
    /// an unknown word is rejected and that word shows up in `top_missing`.
    #[test]
    fn accepts_covered_rejects_uncovered() {
        let covered = "the bird sees the stone";
        let uncovered = "the dragon sees the stone";

        let phonology = Phonology::default();
        let morphology = Morphology::default();
        let mut typology = BTreeMap::new();
        typology.insert("word_order".to_string(), "svo".to_string());
        let lexicon = [
            entry("kira", "noun", "bird"),
            entry("nami", "verb", "to see"),
            entry("pata", "noun", "stone"),
        ];
        let pool = [covered.to_string(), uncovered.to_string()];

        let report = generate(&phonology, &morphology, &typology, &lexicon, &pool);

        assert_eq!(report.scanned, 2);
        let expected_accepted = vec![(covered.to_string(), "kira nami pata".to_string())];
        assert_eq!(report.accepted, expected_accepted);
        assert_eq!(report.rejected.len(), 1);
        assert_eq!(report.acceptance_rate(), 0.5);
        let expected_missing = vec![("dragon".to_string(), 1)];
        assert_eq!(report.top_missing(3), expected_missing);
    }
}