use crate::language_entry::DictionaryEntry;
/// One indexed dictionary sense: a headword plus the searchable tokens of its gloss.
#[derive(Debug, Clone)]
struct Sense {
    /// Headword, copied verbatim from the dictionary entry.
    word: String,
    /// Part-of-speech label, copied verbatim from the dictionary entry.
    pos: String,
    /// Lowercased content words of the entry's translation (stopwords removed).
    tokens: Vec<String>,
}

/// Reverse index from gloss words to dictionary headwords.
///
/// Derives `Debug`, `Clone` and `Default` so the index can be logged, copied and
/// embedded in other derived types; the default value is an empty index.
#[derive(Debug, Clone, Default)]
pub struct GlossIndex {
    /// Indexed senses, kept in the order they were supplied; lookups return the
    /// first match in this order.
    senses: Vec<Sense>,
}
/// Outcome of resolving a lemma against a `GlossIndex` via `GlossIndex::map`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Mapping {
/// A sense matched; `word` and `pos` are clones of the matching sense's fields.
Found { word: String, pos: String },
/// No indexed sense carries the lemma among its gloss tokens.
Missing,
}
/// Breaks a gloss into its lowercased content words.
///
/// The text is cut at every non-alphanumeric character, empty pieces are dropped,
/// each remaining piece is lowercased, and pieces recognised by `is_stopword`
/// are discarded. Order and duplicates are preserved.
fn content_tokens(s: &str) -> Vec<String> {
    let mut kept = Vec::new();
    for piece in s.split(|ch: char| !ch.is_alphanumeric()) {
        if piece.is_empty() {
            continue;
        }
        // Lowercase before the stopword test: the stopword list is lowercase-only.
        let lowered = piece.to_lowercase();
        if !is_stopword(&lowered) {
            kept.push(lowered);
        }
    }
    kept
}
/// Reports whether `t` is one of the function words ignored when indexing glosses.
///
/// The comparison is exact and case-sensitive, so callers must pass an already
/// lowercased token.
fn is_stopword(t: &str) -> bool {
    const STOPWORDS: &[&str] = &[
        "to", "the", "a", "an", "le", "la", "les", "un", "une", "des", "du", "de", "der", "die",
        "das", "ein", "eine", "el", "los", "las", "una", "unos", "unas",
    ];
    STOPWORDS.contains(&t)
}
/// Checks whether a free-form part-of-speech label falls under the category `want`.
///
/// The label is lowercased and compared by prefix only: `n` for nouns, `v` for
/// verbs, `adj` for adjectives. Any label starting with the prefix is accepted,
/// so e.g. every label beginning with `n` counts as a noun.
fn pos_is(pos: &str, want: PosHint) -> bool {
    let prefix = match want {
        PosHint::Noun => "n",
        PosHint::Verb => "v",
        PosHint::Adjective => "adj",
    };
    pos.to_lowercase().starts_with(prefix)
}
/// Part-of-speech category a caller expects a looked-up sense to have.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PosHint {
/// Satisfied by entries whose lowercased `pos` label starts with `n`.
Noun,
/// Satisfied by entries whose lowercased `pos` label starts with `v`.
Verb,
/// Satisfied by entries whose lowercased `pos` label starts with `adj`.
Adjective,
}
impl GlossIndex {
pub fn build(entries: &[DictionaryEntry]) -> Self {
let senses = entries
.iter()
.filter(|e| !e.word.trim().is_empty() && !e.translation.trim().is_empty())
.map(|e| Sense {
word: e.word.clone(),
pos: e.pos.clone(),
tokens: content_tokens(&e.translation),
})
.collect();
GlossIndex { senses }
}
pub fn has_sense(&self, lemma: &str, hint: PosHint) -> bool {
let needle = lemma.to_lowercase();
self.senses.iter().any(|s| pos_is(&s.pos, hint) && s.tokens.iter().any(|t| t == &needle))
}
pub fn map(&self, lemma: &str, hint: PosHint) -> Mapping {
let needle = lemma.to_lowercase();
if let Some(s) = self.senses.iter().find(|s| {
pos_is(&s.pos, hint) && s.tokens.len() == 1 && s.tokens[0] == needle
}) {
return Mapping::Found { word: s.word.clone(), pos: s.pos.clone() };
}
if let Some(s) =
self.senses.iter().find(|s| pos_is(&s.pos, hint) && s.tokens.iter().any(|t| t == &needle))
{
return Mapping::Found { word: s.word.clone(), pos: s.pos.clone() };
}
if let Some(s) = self
.senses
.iter()
.find(|s| s.tokens.len() == 1 && s.tokens[0] == needle)
.or_else(|| self.senses.iter().find(|s| s.tokens.iter().any(|t| t == &needle)))
{
return Mapping::Found { word: s.word.clone(), pos: s.pos.clone() };
}
Mapping::Missing
}
}
// Unit tests for gloss-based lookup through `GlossIndex::map`.
#[cfg(test)]
mod tests {
use super::*;
// Builds a dictionary entry with only the three fields the index reads;
// every other field takes its `Default` value.
fn entry(word: &str, pos: &str, translation: &str) -> DictionaryEntry {
DictionaryEntry {
word: word.into(),
pos: pos.into(),
translation: translation.into(),
..Default::default()
}
}
// A lemma resolves to the entry glossed with it, and an unknown lemma is `Missing`.
#[test]
fn maps_by_gloss_with_pos_hint() {
let entries = vec![
entry("kira", "noun", "bird"),
entry("nami", "verb", "to see"),
entry("pata", "noun", "stone"),
];
let idx = GlossIndex::build(&entries);
assert_eq!(
idx.map("bird", PosHint::Noun),
Mapping::Found { word: "kira".into(), pos: "noun".into() }
);
// "to" is a stopword, so the gloss "to see" is indexed as the single token "see".
assert_eq!(
idx.map("see", PosHint::Verb),
Mapping::Found { word: "nami".into(), pos: "verb".into() }
);
assert_eq!(idx.map("dragon", PosHint::Noun), Mapping::Missing);
}
// Two entries share the gloss token "water"; the hint decides which one is returned.
#[test]
fn pos_hint_breaks_homograph_ties() {
let entries = vec![
entry("móru", "noun", "water"),
entry("móruta", "verb", "to water"),
];
let idx = GlossIndex::build(&entries);
assert_eq!(
idx.map("water", PosHint::Verb),
Mapping::Found { word: "móruta".into(), pos: "verb".into() }
);
assert_eq!(
idx.map("water", PosHint::Noun),
Mapping::Found { word: "móru".into(), pos: "noun".into() }
);
}
}