use std::collections::BTreeSet;
use std::sync::OnceLock;
use super::parser::{parse_lino, LinoNode};
use super::roles::{ROLE_ONTOLOGY_CATEGORY, ROLE_ONTOLOGY_ROOT, ROLE_ONTOLOGY_TYPE};
use super::MEANING_FILES;
/// Where the `…` placeholder sits inside a word form's text, as classified by
/// `WordForm::slot`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Slot {
/// No placeholder at all, or a placeholder with no text on either side.
Bare,
/// Text appears only before the placeholder.
Prefix,
/// Text appears only after the placeholder.
Suffix,
/// Text appears on both sides of the placeholder.
Circumfix,
}
/// One surface form of a lexeme, together with its optional annotations.
#[derive(Debug, Clone)]
pub struct WordForm {
/// Surface text; may contain a `…` placeholder that splits it into the
/// parts returned by `before_slot` and `after_slot`.
pub text: String,
/// Value of the word node's `description` child, as read by the parser.
pub description: String,
/// Value of the word node's `action` child, as read by the parser.
pub action: String,
}
impl WordForm {
    /// Classifies this form by where text sits relative to the first `…`
    /// placeholder in `text`.
    ///
    /// A form without a placeholder, or one consisting of nothing but the
    /// placeholder, is [`Slot::Bare`].
    #[must_use]
    pub fn slot(&self) -> Slot {
        let Some((head, tail)) = self.text.split_once('…') else {
            return Slot::Bare;
        };
        if head.is_empty() {
            if tail.is_empty() {
                Slot::Bare
            } else {
                Slot::Suffix
            }
        } else if tail.is_empty() {
            Slot::Prefix
        } else {
            Slot::Circumfix
        }
    }

    /// Returns the text preceding the first `…` placeholder, or the whole
    /// text when there is no placeholder.
    #[must_use]
    pub fn before_slot(&self) -> &str {
        self.text
            .split_once('…')
            .map_or(self.text.as_str(), |(head, _)| head)
    }

    /// Returns the text following the first `…` placeholder, or the empty
    /// string when there is no placeholder.
    #[must_use]
    pub fn after_slot(&self) -> &str {
        self.text.split_once('…').map_or("", |(_, tail)| tail)
    }
}
/// The word forms a meaning has in one language.
#[derive(Debug, Clone)]
pub struct Lexeme {
/// Language identifier, taken from the id of the `lexeme` node.
pub language: String,
/// Word forms listed under the lexeme, in source order.
pub words: Vec<WordForm>,
}
/// A single lexicon entry: an identified meaning with its metadata, roles,
/// definitional links and per-language lexemes.
#[derive(Debug, Clone)]
pub struct Meaning {
/// Identifier of the meaning, taken from the id of the `meaning` node.
pub slug: String,
/// Value of the `gloss` child.
pub gloss: String,
/// Value of the `wiktionary` child.
pub wiktionary: String,
/// Value of the `wikidata` child; used as a lookup key by
/// `Lexicon::meaning_by_wikidata`.
pub wikidata: String,
/// Ids of the `defined_by` children; `Lexicon::reaches_root` follows
/// them as slugs of other meanings.
pub defined_by: Vec<String>,
/// Ids of the `role` children.
pub roles: Vec<String>,
/// One entry per `lexeme` child, in source order.
pub lexemes: Vec<Lexeme>,
}
impl Meaning {
    /// Returns `true` when `role` is exactly one of this meaning's roles.
    #[must_use]
    pub fn has_role(&self, role: &str) -> bool {
        self.roles.iter().any(|r| r == role)
    }

    /// Iterates over the text of every word form, lexeme by lexeme, in
    /// source order.
    pub fn words(&self) -> impl Iterator<Item = &str> {
        self.word_forms().map(|form| form.text.as_str())
    }

    /// Iterates over every word form of every lexeme, in source order.
    pub fn word_forms(&self) -> impl Iterator<Item = &WordForm> {
        self.lexemes.iter().flat_map(|lexeme| lexeme.words.iter())
    }

    /// Returns `true` when any word of this meaning is present in
    /// `normalized` according to `surface_present`.
    #[must_use]
    pub fn evidenced_in(&self, normalized: &str) -> bool {
        self.words().any(|word| surface_present(normalized, word))
    }

    /// Returns the first word listed for `language`, if any.
    ///
    /// Every lexeme with a matching language is considered, in source order,
    /// so an empty lexeme does not hide the words of a later lexeme for the
    /// same language. This matches how `lexeme_lists` and
    /// `mentions_in_languages_raw` treat repeated languages.
    #[must_use]
    pub fn word_in(&self, language: &str) -> Option<&str> {
        self.lexemes
            .iter()
            .filter(|lexeme| lexeme.language == language)
            .find_map(|lexeme| lexeme.words.first())
            .map(|form| form.text.as_str())
    }

    /// Returns the description of the first word form whose text equals
    /// `word`, ignoring ASCII case only.
    #[must_use]
    pub fn describe_word(&self, word: &str) -> Option<&str> {
        self.word_forms()
            .find(|form| form.text.eq_ignore_ascii_case(word))
            .map(|form| form.description.as_str())
    }

    /// Returns the set of distinct languages this meaning has lexemes for.
    #[must_use]
    pub fn languages(&self) -> BTreeSet<String> {
        self.lexemes.iter().map(|l| l.language.clone()).collect()
    }

    /// Returns `true` when any non-empty word belonging to one of
    /// `languages` occurs in `normalized` as a plain substring (no word
    /// boundary check).
    #[must_use]
    pub fn mentions_in_languages_raw(&self, normalized: &str, languages: &[&str]) -> bool {
        self.lexemes
            .iter()
            .filter(|lexeme| languages.contains(&lexeme.language.as_str()))
            .flat_map(|lexeme| lexeme.words.iter())
            .any(|word| !word.text.is_empty() && normalized.contains(word.text.as_str()))
    }

    /// Returns `true` when some lexeme for `language` lists a word whose
    /// text is exactly `surface`.
    fn lexeme_lists(&self, language: &str, surface: &str) -> bool {
        self.lexemes
            .iter()
            .filter(|lexeme| lexeme.language == language)
            .flat_map(|lexeme| lexeme.words.iter())
            .any(|word| word.text == surface)
    }

    /// Like `lexeme_lists`, but the matching word must also carry exactly
    /// the given `action`.
    fn lexeme_lists_action(&self, language: &str, surface: &str, action: &str) -> bool {
        self.lexemes
            .iter()
            .filter(|lexeme| lexeme.language == language)
            .flat_map(|lexeme| lexeme.words.iter())
            .any(|word| word.text == surface && word.action == action)
    }
}
/// List of `(word, value)` string pairs, as produced by
/// `Lexicon::arithmetic_normalization_tables`.
pub type WordValueTable = Vec<(String, String)>;
/// The full collection of parsed meanings, with role- and language-based
/// lookup helpers.
#[derive(Debug, Clone, Default)]
pub struct Lexicon {
/// All meanings, in the order they were parsed.
pub meanings: Vec<Meaning>,
}
impl Lexicon {
    /// Looks up the first meaning whose slug is exactly `slug`.
    #[must_use]
    pub fn meaning(&self, slug: &str) -> Option<&Meaning> {
        self.meanings.iter().find(|m| m.slug == slug)
    }

    /// Looks up the first meaning whose `wikidata` field is exactly `id`.
    #[must_use]
    pub fn meaning_by_wikidata(&self, id: &str) -> Option<&Meaning> {
        self.meanings.iter().find(|m| m.wikidata == id)
    }

    /// Iterates, in lexicon order, over the meanings that carry `role`.
    pub fn meanings_with_role<'a>(&'a self, role: &'a str) -> impl Iterator<Item = &'a Meaning> {
        self.meanings.iter().filter(move |m| m.has_role(role))
    }

    /// Collects every word form of every meaning that carries `role`.
    #[must_use]
    pub fn role_word_forms<'a>(&'a self, role: &str) -> Vec<&'a WordForm> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .flat_map(Meaning::word_forms)
            .collect()
    }

    /// Translates `surface` from the `source` language to the `target`
    /// language within `role`.
    ///
    /// The first meaning with `role` that lists exactly `surface` under
    /// `source` is selected; the result is that meaning's word for `target`,
    /// or `None` when no meaning matches or it has no `target` word.
    #[must_use]
    pub fn role_surface_translation<'a>(
        &'a self,
        role: &str,
        source: &str,
        target: &str,
        surface: &str,
    ) -> Option<&'a str> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.lexeme_lists(source, surface))
            .and_then(|meaning| meaning.word_in(target))
    }

    /// Returns `true` when some meaning with `role` lists exactly `surface`
    /// under `language`.
    #[must_use]
    pub fn role_lists_surface(&self, role: &str, language: &str, surface: &str) -> bool {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .any(|meaning| meaning.lexeme_lists(language, surface))
    }

    /// Like [`Lexicon::role_surface_translation`], but the source word must
    /// also carry exactly the given `action`.
    #[must_use]
    pub fn role_action_surface_translation<'a>(
        &'a self,
        role: &str,
        action: &str,
        source: &str,
        target: &str,
        surface: &str,
    ) -> Option<&'a str> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.lexeme_lists_action(source, surface, action))
            .and_then(|meaning| meaning.word_in(target))
    }

    /// Returns the distinct words of all meanings with `role`, in order of
    /// first appearance.
    #[must_use]
    pub fn words_for_role(&self, role: &str) -> Vec<String> {
        // A seen-set keeps first-occurrence order without rescanning `out`
        // for every candidate word.
        let mut seen: BTreeSet<&str> = BTreeSet::new();
        let mut out: Vec<String> = Vec::new();
        for meaning in self.meanings_with_role(role) {
            for word in meaning.words() {
                if seen.insert(word) {
                    out.push(word.to_string());
                }
            }
        }
        out
    }

    /// Returns `true` when some meaning with `role` is evidenced in
    /// `normalized` (see `Meaning::evidenced_in`).
    #[must_use]
    pub fn mentions_role(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role)
            .any(|meaning| meaning.evidenced_in(normalized))
    }

    /// Like [`Lexicon::mentions_role`], but only words containing at least
    /// one alphabetic character are considered.
    #[must_use]
    pub fn mentions_role_spelled(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role).any(|meaning| {
            meaning
                .words()
                .filter(|word| word.chars().any(char::is_alphabetic))
                .any(|word| surface_present(normalized, word))
        })
    }

    /// Returns `true` when any non-empty word of a meaning with `role`
    /// occurs in `normalized` as a plain substring (no word boundary check).
    #[must_use]
    pub fn mentions_role_raw(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role).any(|meaning| {
            meaning
                .words()
                // `str::contains("")` is always true, so an empty surface
                // must never count as a mention.
                .any(|word| !word.is_empty() && normalized.contains(word))
        })
    }

    /// Builds the `(tokens, phrases)` tables that map spelled-out words to
    /// their value surface, for meanings with the cardinal-number-word and
    /// arithmetic-operator-word roles.
    ///
    /// For each such meaning the value is its first non-empty word without
    /// any alphabetic character; meanings lacking one are skipped. Every
    /// word that does contain an alphabetic character is mapped to that
    /// value: words containing whitespace go to `phrases`, the rest to
    /// `tokens`.
    ///
    /// `tokens` is sorted and deduplicated. `phrases` is sorted by
    /// descending character count, then by `(word, value)`, and
    /// deduplicated.
    #[must_use]
    pub fn arithmetic_normalization_tables(&self) -> (WordValueTable, WordValueTable) {
        let is_value_surface = |word: &str| !word.chars().any(char::is_alphabetic);
        let mut tokens: WordValueTable = Vec::new();
        let mut phrases: WordValueTable = Vec::new();
        for role in [
            super::roles::ROLE_CARDINAL_NUMBER_WORD,
            super::roles::ROLE_ARITHMETIC_OPERATOR_WORD,
        ] {
            for meaning in self.meanings_with_role(role) {
                // An empty surface has no alphabetic characters either, but
                // it is not a usable value to normalize words into.
                let Some(value) = meaning
                    .words()
                    .find(|&word| !word.is_empty() && is_value_surface(word))
                else {
                    continue;
                };
                for word in meaning.words() {
                    // Skips the chosen value, other value surfaces and
                    // empty surfaces alike.
                    if is_value_surface(word) {
                        continue;
                    }
                    let entry = (word.to_string(), value.to_string());
                    if word.chars().any(char::is_whitespace) {
                        phrases.push(entry);
                    } else {
                        tokens.push(entry);
                    }
                }
            }
        }
        tokens.sort();
        tokens.dedup();
        phrases.sort_by(|a, b| {
            b.0.chars()
                .count()
                .cmp(&a.0.chars().count())
                // Tie-break on the whole pair so equal entries end up
                // adjacent and `dedup` removes every duplicate.
                .then_with(|| a.cmp(b))
        });
        phrases.dedup();
        (tokens, phrases)
    }

    /// Returns the distinct words of all meanings with `role`, restricted to
    /// lexemes whose language is in `languages`, in order of first
    /// appearance.
    #[must_use]
    pub fn words_for_role_in_languages(&self, role: &str, languages: &[&str]) -> Vec<String> {
        let mut seen: BTreeSet<&str> = BTreeSet::new();
        let mut out: Vec<String> = Vec::new();
        for meaning in self.meanings_with_role(role) {
            for lexeme in &meaning.lexemes {
                if !languages.contains(&lexeme.language.as_str()) {
                    continue;
                }
                for word in &lexeme.words {
                    if seen.insert(word.text.as_str()) {
                        out.push(word.text.clone());
                    }
                }
            }
        }
        out
    }

    /// Returns the first language in `priority` for which some meaning with
    /// `role` has a non-empty word in that language occurring in
    /// `normalized` as a plain substring.
    #[must_use]
    pub fn first_role_language(
        &self,
        role: &str,
        normalized: &str,
        priority: &[&'static str],
    ) -> Option<&'static str> {
        priority.iter().copied().find(|&lang| {
            self.meanings_with_role(role)
                .any(|meaning| meaning.mentions_in_languages_raw(normalized, &[lang]))
        })
    }

    /// Returns the first meaning with `role` that is evidenced in
    /// `normalized` (see `Meaning::evidenced_in`).
    #[must_use]
    pub fn first_role_match(&self, role: &str, normalized: &str) -> Option<&Meaning> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.evidenced_in(normalized))
    }

    /// Returns the first meaning with `role` that has a word in one of
    /// `languages` occurring in `normalized` as a plain substring.
    #[must_use]
    pub fn first_role_match_in_languages_raw(
        &self,
        role: &str,
        normalized: &str,
        languages: &[&str],
    ) -> Option<&Meaning> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.mentions_in_languages_raw(normalized, languages))
    }

    /// Returns `true` when some meaning with `role` has a word in one of
    /// `languages` occurring in `normalized` as a plain substring.
    #[must_use]
    pub fn mentions_role_in_languages_raw(
        &self,
        role: &str,
        normalized: &str,
        languages: &[&str],
    ) -> bool {
        self.meanings_with_role(role)
            .any(|meaning| meaning.mentions_in_languages_raw(normalized, languages))
    }

    /// Returns the first meaning carrying the ontology-root role.
    #[must_use]
    pub fn ontology_root(&self) -> Option<&Meaning> {
        self.meanings
            .iter()
            .find(|m| m.has_role(ROLE_ONTOLOGY_ROOT))
    }

    /// Returns `true` when the ontology root can be reached from `slug` by
    /// following `defined_by` links (or when `slug` is the root itself).
    ///
    /// Returns `false` when the lexicon has no ontology root. Slugs that do
    /// not resolve to a meaning are dead ends, and already visited slugs are
    /// not expanded again, so cyclic definitions terminate.
    #[must_use]
    pub fn reaches_root(&self, slug: &str) -> bool {
        let Some(root) = self.ontology_root() else {
            return false;
        };
        let mut seen: BTreeSet<&str> = BTreeSet::new();
        let mut stack: Vec<&str> = vec![slug];
        while let Some(current) = stack.pop() {
            if current == root.slug {
                return true;
            }
            if !seen.insert(current) {
                continue;
            }
            if let Some(meaning) = self.meaning(current) {
                stack.extend(meaning.defined_by.iter().map(String::as_str));
            }
        }
        false
    }

    /// Returns the first meaning carrying the ontology-type role.
    #[must_use]
    pub fn ontology_type_root(&self) -> Option<&Meaning> {
        self.meanings
            .iter()
            .find(|m| m.has_role(ROLE_ONTOLOGY_TYPE))
    }

    /// Iterates over the meanings carrying the ontology-category role.
    pub fn ontology_categories(&self) -> impl Iterator<Item = &Meaning> {
        self.meanings_with_role(ROLE_ONTOLOGY_CATEGORY)
    }
}
/// Reports whether `expected` occurs in `normalized` as a surface form.
///
/// An empty `expected` never matches. When `crate::coding::contains_cjk`
/// reports `true` for `expected`, any substring occurrence counts.
/// Otherwise the occurrence must be delimited by a single space character
/// or by the start/end of `normalized` on each side.
fn surface_present(normalized: &str, expected: &str) -> bool {
    if expected.is_empty() {
        return false;
    }
    if crate::coding::contains_cjk(expected) {
        return normalized.contains(expected);
    }
    if normalized == expected {
        return true;
    }
    let leads = normalized
        .strip_prefix(expected)
        .is_some_and(|rest| rest.starts_with(' '));
    let trails = normalized
        .strip_suffix(expected)
        .is_some_and(|rest| rest.ends_with(' '));
    leads || trails || normalized.contains(&format!(" {expected} "))
}
fn parse_lexicon(text: &str) -> Lexicon {
let root = parse_lino(text);
let mut meanings = Vec::new();
let containers: Vec<&LinoNode> = root
.children
.iter()
.filter(|c| c.name == "meanings")
.collect();
let sources: Vec<&LinoNode> = if containers.is_empty() {
vec![&root]
} else {
containers
};
for container in sources {
for node in container.children.iter().filter(|c| c.name == "meaning") {
meanings.push(parse_meaning(node));
}
}
Lexicon { meanings }
}
/// Builds a [`Meaning`] from a `meaning` node.
///
/// The node id becomes the slug; `gloss`, `wiktionary` and `wikidata` come
/// from the child values of the same name. `defined_by` and `role` children
/// contribute their ids, and each `lexeme` child becomes a [`Lexeme`] whose
/// `word` children become word forms. Children with other names are
/// ignored.
fn parse_meaning(node: &LinoNode) -> Meaning {
    // Ids of the direct children with the given node name, in source order.
    let child_ids = |wanted: &str| -> Vec<String> {
        node.children
            .iter()
            .filter(|child| child.name.as_str() == wanted)
            .map(|child| child.id.clone())
            .collect()
    };

    let lexemes = node
        .children
        .iter()
        .filter(|child| child.name.as_str() == "lexeme")
        .map(|entry| Lexeme {
            language: entry.id.clone(),
            words: entry
                .children
                .iter()
                .filter(|word| word.name == "word")
                .map(|word| WordForm {
                    text: word.id.clone(),
                    description: word.find_child_value("description").to_string(),
                    action: word.find_child_value("action").to_string(),
                })
                .collect(),
        })
        .collect();

    Meaning {
        slug: node.id.clone(),
        gloss: node.find_child_value("gloss").to_string(),
        wiktionary: node.find_child_value("wiktionary").to_string(),
        wikidata: node.find_child_value("wikidata").to_string(),
        defined_by: child_ids("defined_by"),
        roles: child_ids("role"),
        lexemes,
    }
}
/// Returns the process-wide lexicon.
///
/// On first use the entries of `MEANING_FILES` are joined with newlines and
/// parsed; the result is cached in a `OnceLock` and reused by later calls.
#[must_use]
pub fn lexicon() -> &'static Lexicon {
    static CACHE: OnceLock<Lexicon> = OnceLock::new();
    CACHE.get_or_init(|| {
        let combined = MEANING_FILES.join("\n");
        parse_lexicon(&combined)
    })
}