pub mod cognate;
pub mod swadesh;
use std::borrow::Cow;
use serde::{Deserialize, Serialize};
/// A single lexicon entry: a word plus its transcription, gloss, part of
/// speech, and optional ranking metadata.
///
/// All text fields are `Cow<'static, str>`, so an entry can hold either
/// borrowed static data or owned strings.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct LexEntry {
/// The word itself; this is the field `Lexicon::find` compares against.
pub word: Cow<'static, str>,
/// Pronunciation transcription (presumably IPA, going by the field name).
pub ipa: Cow<'static, str>,
/// Short explanation of the word's meaning.
pub gloss: Cow<'static, str>,
/// Grammatical category of the word.
pub pos: PartOfSpeech,
/// Frequency rank, if known. `Lexicon::most_frequent` sorts ascending on
/// this value and skips entries where it is `None`.
pub frequency_rank: Option<u32>,
/// Position in the Swadesh list, if the word belongs to it.
/// `Lexicon::swadesh` sorts ascending on this value and skips entries
/// where it is `None`.
pub swadesh_index: Option<u16>,
}
/// Grammatical category assigned to a [`LexEntry`].
///
/// Marked `#[non_exhaustive]`: code in other crates must include a wildcard
/// arm when matching, so further variants can be added without breaking them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum PartOfSpeech {
Noun,
Verb,
Adjective,
Adverb,
Pronoun,
Preposition,
Conjunction,
Interjection,
Determiner,
Particle,
Numeral,
}
/// A collection of [`LexEntry`] values belonging to one language.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Lexicon {
/// Identifier of the language (the tests in this file use `"en"`).
/// NOTE(review): no format is enforced here — confirm the expected code
/// standard with callers.
pub language_code: Cow<'static, str>,
/// The entries, in insertion order. Lookups scan this vector linearly.
pub entries: Vec<LexEntry>,
}
impl Lexicon {
    /// Returns the first entry whose `word` is exactly equal to `word`.
    ///
    /// The comparison is plain string equality (case-sensitive, no
    /// normalization) and the search is a linear scan over `entries`.
    /// Returns `None` when no entry matches.
    #[must_use]
    pub fn find(&self, word: &str) -> Option<&LexEntry> {
        tracing::trace!(language = %self.language_code, word, "lexicon lookup");
        self.entries.iter().find(|entry| entry.word == word)
    }

    /// Returns every entry that has a `swadesh_index`, ordered by ascending
    /// index.
    ///
    /// Entries without an index are omitted. The sort is stable, so entries
    /// sharing an index keep their relative order from `entries`.
    #[must_use]
    pub fn swadesh(&self) -> Vec<&LexEntry> {
        tracing::trace!(language = %self.language_code, "extracting Swadesh list");
        let mut listed: Vec<&LexEntry> = self
            .entries
            .iter()
            .filter(|entry| entry.swadesh_index.is_some())
            .collect();
        // Only `Some` values survive the filter, so ordering the options
        // directly is ordering by index; no sentinel fallback is needed.
        listed.sort_by_key(|entry| entry.swadesh_index);
        listed
    }

    /// Returns up to `n` entries with the lowest `frequency_rank`, ordered by
    /// ascending rank.
    ///
    /// Entries without a rank are omitted. If fewer than `n` ranked entries
    /// exist, all of them are returned. The sort is stable, so entries
    /// sharing a rank keep their relative order from `entries`.
    #[must_use]
    pub fn most_frequent(&self, n: usize) -> Vec<&LexEntry> {
        tracing::trace!(language = %self.language_code, n, "frequency ranking");
        let mut ranked: Vec<&LexEntry> = self
            .entries
            .iter()
            .filter(|entry| entry.frequency_rank.is_some())
            .collect();
        // Only `Some` values survive the filter, so ordering the options
        // directly is ordering by rank.
        ranked.sort_by_key(|entry| entry.frequency_rank);
        // Shorten in place rather than collecting into a second vector.
        ranked.truncate(n);
        ranked
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an entry from static string data so fixtures stay compact.
    fn entry(
        word: &'static str,
        ipa: &'static str,
        gloss: &'static str,
        pos: PartOfSpeech,
        frequency_rank: Option<u32>,
        swadesh_index: Option<u16>,
    ) -> LexEntry {
        LexEntry {
            word: Cow::Borrowed(word),
            ipa: Cow::Borrowed(ipa),
            gloss: Cow::Borrowed(gloss),
            pos,
            frequency_rank,
            swadesh_index,
        }
    }

    /// Three-entry fixture: two nouns with Swadesh indices plus a ranked
    /// determiner that has no Swadesh index.
    fn sample_lexicon() -> Lexicon {
        Lexicon {
            language_code: Cow::Borrowed("en"),
            entries: vec![
                entry(
                    "water",
                    "ˈwɔːtər",
                    "water",
                    PartOfSpeech::Noun,
                    Some(250),
                    Some(1),
                ),
                entry(
                    "fire",
                    "ˈfaɪər",
                    "fire",
                    PartOfSpeech::Noun,
                    Some(800),
                    Some(2),
                ),
                entry(
                    "the",
                    "ðə",
                    "the (definite article)",
                    PartOfSpeech::Determiner,
                    Some(1),
                    None,
                ),
            ],
        }
    }

    #[test]
    fn test_find_word() {
        let lex = sample_lexicon();
        let found = lex.find("water").expect("fixture contains `water`");
        assert_eq!(found.word, "water");
        assert_eq!(found.pos, PartOfSpeech::Noun);
        assert!(lex.find("xyz").is_none());
        // Lookup is exact string equality, so a different case does not match.
        assert!(lex.find("Water").is_none());
    }

    #[test]
    fn test_swadesh_list() {
        let lex = sample_lexicon();
        let sw = lex.swadesh();
        assert_eq!(sw.len(), 2);
        assert_eq!(sw[0].word, "water");
        assert_eq!(sw[1].word, "fire");
    }

    /// The shared fixture is already in index order, so use a reversed one to
    /// show that the result is really sorted.
    #[test]
    fn test_swadesh_sorts_by_index() {
        let lex = Lexicon {
            language_code: Cow::Borrowed("en"),
            entries: vec![
                entry("fire", "ˈfaɪər", "fire", PartOfSpeech::Noun, None, Some(2)),
                entry("the", "ðə", "the", PartOfSpeech::Determiner, None, None),
                entry("water", "ˈwɔːtər", "water", PartOfSpeech::Noun, None, Some(1)),
            ],
        };
        let sw = lex.swadesh();
        assert_eq!(sw.len(), 2);
        assert_eq!(sw[0].word, "water");
        assert_eq!(sw[1].word, "fire");
    }

    #[test]
    fn test_most_frequent() {
        let lex = sample_lexicon();
        let freq = lex.most_frequent(2);
        assert_eq!(freq.len(), 2);
        assert_eq!(freq[0].word, "the");
        assert_eq!(freq[1].word, "water");
    }

    #[test]
    fn test_most_frequent_limits_and_skips_unranked() {
        let mut lex = sample_lexicon();
        lex.entries.push(entry(
            "cat",
            "kæt",
            "cat",
            PartOfSpeech::Noun,
            None,
            None,
        ));
        // Asking for more than exist returns every ranked entry, in rank order.
        let all = lex.most_frequent(10);
        assert_eq!(all.len(), 3);
        assert_eq!(all[0].word, "the");
        assert_eq!(all[1].word, "water");
        assert_eq!(all[2].word, "fire");
        assert!(lex.most_frequent(0).is_empty());
    }

    #[test]
    fn test_find_empty_lexicon() {
        let lex = Lexicon {
            language_code: Cow::Borrowed("xx"),
            entries: vec![],
        };
        assert!(lex.find("anything").is_none());
        assert!(lex.swadesh().is_empty());
        assert!(lex.most_frequent(10).is_empty());
    }

    #[test]
    fn test_lex_entry_eq() {
        let a = entry("cat", "kæt", "cat", PartOfSpeech::Noun, Some(500), None);
        let b = a.clone();
        assert_eq!(a, b);
    }
}