use std::collections::BTreeSet;
use std::sync::OnceLock;
use super::parser::{decode_codepoints, parse_lino, LinoNode};
use super::roles::{ROLE_ONTOLOGY_CATEGORY, ROLE_ONTOLOGY_ROOT, ROLE_ONTOLOGY_TYPE};
use super::MEANING_FILES;
/// Where the `…` placeholder sits inside a word form's text, as classified by
/// [`WordForm::slot`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Slot {
/// No placeholder at all, or a placeholder with no text on either side.
Bare,
/// Text appears only before the placeholder.
Prefix,
/// Text appears only after the placeholder.
Suffix,
/// Text appears on both sides of the placeholder.
Circumfix,
}
/// One surface form of a meaning, with its description, action and semantic facets.
#[derive(Debug, Clone)]
pub struct WordForm {
/// Surface text; the first `…` character, if any, splits it into the parts
/// returned by `before_slot` and `after_slot`.
pub text: String,
/// Stored `description` value, or a generated fallback when none is stored.
pub description: String,
/// Value of the node's `action` child, as returned by `find_child_value`.
pub action: String,
/// Semantic facets attached to this form.
pub semantic_facets: Vec<SemanticFacet>,
}
impl WordForm {
    /// Classifies the form by which sides of the first `…` carry text.
    ///
    /// A text without `…`, or one where both sides are empty, is [`Slot::Bare`].
    #[must_use]
    pub fn slot(&self) -> Slot {
        let Some((head, tail)) = self.text.split_once('…') else {
            return Slot::Bare;
        };
        if head.is_empty() {
            if tail.is_empty() {
                Slot::Bare
            } else {
                Slot::Suffix
            }
        } else if tail.is_empty() {
            Slot::Prefix
        } else {
            Slot::Circumfix
        }
    }

    /// Returns the text preceding the first `…`, or the whole text when there is none.
    #[must_use]
    pub fn before_slot(&self) -> &str {
        self.text
            .split_once('…')
            .map_or(self.text.as_str(), |(head, _)| head)
    }

    /// Returns the text following the first `…`, or `""` when there is none.
    #[must_use]
    pub fn after_slot(&self) -> &str {
        self.text.split_once('…').map_or("", |(_, tail)| tail)
    }

    /// Iterates over the targets of every facet on this form whose kind equals `kind`.
    pub fn semantic_facet_targets<'a>(&'a self, kind: &'a str) -> impl Iterator<Item = &'a str> {
        let matching = self
            .semantic_facets
            .iter()
            .filter(move |facet| facet.kind.as_str() == kind);
        matching
            .flat_map(|facet| facet.meanings.iter())
            .map(|target| target.as_str())
    }
}
/// The word forms a meaning has in one language.
#[derive(Debug, Clone)]
pub struct Lexeme {
/// Language identifier this group of word forms belongs to.
pub language: String,
/// Word forms listed for that language, in source order.
pub words: Vec<WordForm>,
}
/// A named facet (for example `notation` or `denotation`) and the targets it points at.
#[derive(Debug, Clone)]
pub struct SemanticFacet {
/// Facet kind; compared by exact string equality.
pub kind: String,
/// Targets recorded for this facet; the parsing helpers avoid adding duplicates.
pub meanings: Vec<String>,
}
/// A single lexicon entry: its identity, definition links, roles, facets and lexemes.
#[derive(Debug, Clone)]
pub struct Meaning {
/// Identifier used to look the meaning up and to reference it from other meanings.
pub slug: String,
/// Stored gloss, or a description generated from the slug and `defined_by`.
pub gloss: String,
/// Value of the node's `wiktionary` child.
pub wiktionary: String,
/// Identifier taken from a `grounded-in` or `wikidata` child.
pub wikidata: String,
/// Slugs this meaning is defined by (followed by `Lexicon::reaches_root`).
pub defined_by: Vec<String>,
/// Role names attached to this meaning.
pub roles: Vec<String>,
/// Semantic facets attached to the meaning itself.
pub semantic_facets: Vec<SemanticFacet>,
/// Per-language word forms.
pub lexemes: Vec<Lexeme>,
}
impl Meaning {
    /// Reports whether `role` is one of this meaning's roles (exact match).
    #[must_use]
    pub fn has_role(&self, role: &str) -> bool {
        self.roles.iter().any(|candidate| candidate.as_str() == role)
    }

    /// Iterates over the targets of every facet on this meaning whose kind equals `kind`.
    pub fn semantic_facet_targets<'a>(&'a self, kind: &'a str) -> impl Iterator<Item = &'a str> {
        let matching = self
            .semantic_facets
            .iter()
            .filter(move |facet| facet.kind.as_str() == kind);
        matching
            .flat_map(|facet| facet.meanings.iter())
            .map(|target| target.as_str())
    }

    /// Iterates over the surface text of every word form, across all lexemes.
    pub fn words(&self) -> impl Iterator<Item = &str> {
        self.word_forms().map(|form| form.text.as_str())
    }

    /// Iterates over every word form, lexeme by lexeme in stored order.
    pub fn word_forms(&self) -> impl Iterator<Item = &WordForm> {
        self.lexemes.iter().flat_map(|lexeme| &lexeme.words)
    }

    /// Reports whether any word of this meaning is present in `normalized`
    /// according to `surface_present`.
    #[must_use]
    pub fn evidenced_in(&self, normalized: &str) -> bool {
        for word in self.words() {
            if surface_present(normalized, word) {
                return true;
            }
        }
        false
    }

    /// Returns the first word of the first lexeme whose language equals `language`.
    ///
    /// Only that first matching lexeme is consulted; if it has no words the result is `None`.
    #[must_use]
    pub fn word_in(&self, language: &str) -> Option<&str> {
        let lexeme = self
            .lexemes
            .iter()
            .find(|lexeme| lexeme.language == language)?;
        let first = lexeme.words.first()?;
        Some(first.text.as_str())
    }

    /// Returns the description of the first word form whose text equals `word`,
    /// ignoring ASCII case.
    #[must_use]
    pub fn describe_word(&self, word: &str) -> Option<&str> {
        let form = self
            .word_forms()
            .find(|form| form.text.eq_ignore_ascii_case(word))?;
        Some(form.description.as_str())
    }

    /// Collects the distinct languages this meaning has lexemes for.
    #[must_use]
    pub fn languages(&self) -> BTreeSet<String> {
        let mut seen = BTreeSet::new();
        for lexeme in &self.lexemes {
            seen.insert(lexeme.language.clone());
        }
        seen
    }

    /// Reports whether any non-empty word from the listed `languages` occurs in
    /// `normalized` as a plain substring (no word-boundary check).
    #[must_use]
    pub fn mentions_in_languages_raw(&self, normalized: &str, languages: &[&str]) -> bool {
        self.lexemes.iter().any(|lexeme| {
            languages.contains(&lexeme.language.as_str())
                && lexeme.words.iter().any(|form| {
                    !form.text.is_empty() && normalized.contains(form.text.as_str())
                })
        })
    }

    /// Reports whether a lexeme in `language` lists exactly `surface`.
    fn lexeme_lists(&self, language: &str, surface: &str) -> bool {
        self.lexemes.iter().any(|lexeme| {
            lexeme.language == language && lexeme.words.iter().any(|form| form.text == surface)
        })
    }

    /// Reports whether a lexeme in `language` lists exactly `surface` with action `action`.
    fn lexeme_lists_action(&self, language: &str, surface: &str, action: &str) -> bool {
        self.lexemes.iter().any(|lexeme| {
            lexeme.language == language
                && lexeme
                    .words
                    .iter()
                    .any(|form| form.text == surface && form.action == action)
        })
    }
}
/// List of `(word, value)` string pairs, as produced by
/// `Lexicon::arithmetic_normalization_tables`.
pub type WordValueTable = Vec<(String, String)>;
/// In-memory collection of parsed meanings.
#[derive(Debug, Clone, Default)]
pub struct Lexicon {
/// Meanings in the order they were parsed.
pub meanings: Vec<Meaning>,
}
impl Lexicon {
    /// Returns the meaning whose slug equals `slug` exactly, if any.
    #[must_use]
    pub fn meaning(&self, slug: &str) -> Option<&Meaning> {
        self.meanings.iter().find(|m| m.slug == slug)
    }

    /// Resolves the `kind` facet targets of the meaning named `slug` to meanings.
    ///
    /// Targets that do not name a known meaning are skipped; an unknown `slug`
    /// yields an empty list.
    #[must_use]
    pub fn semantic_facet_meanings(&self, slug: &str, kind: &str) -> Vec<&Meaning> {
        let Some(meaning) = self.meaning(slug) else {
            return Vec::new();
        };
        meaning
            .semantic_facet_targets(kind)
            .filter_map(|target| self.meaning(target))
            .collect()
    }

    /// Returns the first meaning whose `wikidata` field equals `id`.
    #[must_use]
    pub fn meaning_by_wikidata(&self, id: &str) -> Option<&Meaning> {
        self.meanings.iter().find(|m| m.wikidata == id)
    }

    /// Iterates over the meanings that carry `role`, in stored order.
    pub fn meanings_with_role<'a>(&'a self, role: &'a str) -> impl Iterator<Item = &'a Meaning> {
        self.meanings.iter().filter(move |m| m.has_role(role))
    }

    /// Collects every word form of every meaning that carries `role`.
    #[must_use]
    pub fn role_word_forms<'a>(&'a self, role: &str) -> Vec<&'a WordForm> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .flat_map(Meaning::word_forms)
            .collect()
    }

    /// Translates `surface` from `source` to `target` within meanings carrying `role`.
    ///
    /// The first meaning with `role` that lists exactly `surface` in `source` is
    /// chosen; the result is that meaning's first word in `target`, if it has one.
    #[must_use]
    pub fn role_surface_translation<'a>(
        &'a self,
        role: &str,
        source: &str,
        target: &str,
        surface: &str,
    ) -> Option<&'a str> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.lexeme_lists(source, surface))
            .and_then(|meaning| meaning.word_in(target))
    }

    /// Reports whether any meaning with `role` lists exactly `surface` in `language`.
    #[must_use]
    pub fn role_lists_surface(&self, role: &str, language: &str, surface: &str) -> bool {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .any(|meaning| meaning.lexeme_lists(language, surface))
    }

    /// Like [`Lexicon::role_surface_translation`], but the source word form must
    /// also carry the given `action`.
    #[must_use]
    pub fn role_action_surface_translation<'a>(
        &'a self,
        role: &str,
        action: &str,
        source: &str,
        target: &str,
        surface: &str,
    ) -> Option<&'a str> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.lexeme_lists_action(source, surface, action))
            .and_then(|meaning| meaning.word_in(target))
    }

    /// Lists the distinct words of all meanings with `role`, in first-seen order.
    #[must_use]
    pub fn words_for_role(&self, role: &str) -> Vec<String> {
        let mut out: Vec<String> = Vec::new();
        for meaning in self.meanings.iter().filter(|m| m.has_role(role)) {
            for word in meaning.words() {
                if !out.iter().any(|existing| existing == word) {
                    out.push(word.to_string());
                }
            }
        }
        out
    }

    /// Reports whether any meaning with `role` is evidenced in `normalized`
    /// (see [`Meaning::evidenced_in`]).
    #[must_use]
    pub fn mentions_role(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role)
            .any(|meaning| meaning.evidenced_in(normalized))
    }

    /// Like [`Lexicon::mentions_role`], but only words containing at least one
    /// alphabetic character are considered.
    #[must_use]
    pub fn mentions_role_spelled(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role).any(|meaning| {
            meaning
                .words()
                .filter(|word| word.chars().any(char::is_alphabetic))
                .any(|word| surface_present(normalized, word))
        })
    }

    /// Reports whether any non-empty word of a meaning with `role` occurs in
    /// `normalized` as a plain substring (no word-boundary check).
    #[must_use]
    pub fn mentions_role_raw(&self, role: &str, normalized: &str) -> bool {
        self.meanings_with_role(role).any(|meaning| {
            meaning
                .words()
                // `str::contains("")` is always true, so an empty surface would make
                // every input match; skip it like the other matching helpers do.
                .any(|word| !word.is_empty() && normalized.contains(word))
        })
    }

    /// Builds the `(tokens, phrases)` word-to-value tables for cardinal-number and
    /// arithmetic-operator meanings.
    ///
    /// For each such meaning the first word without alphabetic characters is the
    /// value; every word that does contain alphabetic characters is mapped to it.
    /// Words containing whitespace go to `phrases`, the rest to `tokens`. `tokens`
    /// is sorted and deduplicated; `phrases` is ordered longest word first (by
    /// character count), then by word, then by value, and deduplicated.
    #[must_use]
    pub fn arithmetic_normalization_tables(&self) -> (WordValueTable, WordValueTable) {
        let is_value_surface = |word: &str| !word.chars().any(char::is_alphabetic);
        let mut tokens: WordValueTable = Vec::new();
        let mut phrases: WordValueTable = Vec::new();
        for role in [
            super::roles::ROLE_CARDINAL_NUMBER_WORD,
            super::roles::ROLE_ARITHMETIC_OPERATOR_WORD,
        ] {
            for meaning in self.meanings_with_role(role) {
                // Meanings without a non-alphabetic surface have no value to map to.
                let Some(value) = meaning.words().find(|&word| is_value_surface(word)) else {
                    continue;
                };
                for word in meaning.words() {
                    if word == value || is_value_surface(word) {
                        continue;
                    }
                    let entry = (word.to_string(), value.to_string());
                    if word.chars().any(char::is_whitespace) {
                        phrases.push(entry);
                    } else {
                        tokens.push(entry);
                    }
                }
            }
        }
        tokens.sort();
        tokens.dedup();
        phrases.sort_by(|a, b| {
            b.0.chars()
                .count()
                .cmp(&a.0.chars().count())
                .then_with(|| a.0.cmp(&b.0))
                // Tie-break on the value so identical pairs end up adjacent;
                // `dedup` only removes consecutive duplicates.
                .then_with(|| a.1.cmp(&b.1))
        });
        phrases.dedup();
        (tokens, phrases)
    }

    /// Lists the distinct words of meanings with `role`, restricted to lexemes whose
    /// language is in `languages`, in first-seen order.
    #[must_use]
    pub fn words_for_role_in_languages(&self, role: &str, languages: &[&str]) -> Vec<String> {
        let mut out: Vec<String> = Vec::new();
        for meaning in self.meanings_with_role(role) {
            for lexeme in &meaning.lexemes {
                if !languages.contains(&lexeme.language.as_str()) {
                    continue;
                }
                for word in &lexeme.words {
                    if !out.iter().any(|existing| existing == &word.text) {
                        out.push(word.text.clone());
                    }
                }
            }
        }
        out
    }

    /// Returns the first language in `priority` for which some meaning with `role`
    /// has a non-empty word occurring in `normalized` as a plain substring.
    #[must_use]
    pub fn first_role_language(
        &self,
        role: &str,
        normalized: &str,
        priority: &[&'static str],
    ) -> Option<&'static str> {
        priority.iter().copied().find(|&lang| {
            self.meanings_with_role(role).any(|meaning| {
                meaning
                    .lexemes
                    .iter()
                    .filter(|lexeme| lexeme.language == lang)
                    .any(|lexeme| {
                        lexeme.words.iter().any(|word| {
                            // An empty surface is a substring of everything; it
                            // must not decide the language.
                            !word.text.is_empty() && normalized.contains(word.text.as_str())
                        })
                    })
            })
        })
    }

    /// Returns the first meaning with `role` that is evidenced in `normalized`.
    #[must_use]
    pub fn first_role_match(&self, role: &str, normalized: &str) -> Option<&Meaning> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.evidenced_in(normalized))
    }

    /// Returns the first meaning with `role` that has a word from `languages`
    /// occurring in `normalized` as a plain substring.
    #[must_use]
    pub fn first_role_match_in_languages_raw(
        &self,
        role: &str,
        normalized: &str,
        languages: &[&str],
    ) -> Option<&Meaning> {
        self.meanings
            .iter()
            .filter(|meaning| meaning.has_role(role))
            .find(|meaning| meaning.mentions_in_languages_raw(normalized, languages))
    }

    /// Reports whether any meaning with `role` has a word from `languages`
    /// occurring in `normalized` as a plain substring.
    #[must_use]
    pub fn mentions_role_in_languages_raw(
        &self,
        role: &str,
        normalized: &str,
        languages: &[&str],
    ) -> bool {
        self.meanings_with_role(role)
            .any(|meaning| meaning.mentions_in_languages_raw(normalized, languages))
    }

    /// Returns the first meaning carrying `ROLE_ONTOLOGY_ROOT`.
    #[must_use]
    pub fn ontology_root(&self) -> Option<&Meaning> {
        self.meanings
            .iter()
            .find(|m| m.has_role(ROLE_ONTOLOGY_ROOT))
    }

    /// Reports whether following `defined_by` links from `slug` reaches the
    /// ontology root.
    ///
    /// Returns `false` when the lexicon has no root. Slugs that do not name a
    /// known meaning are dead ends.
    #[must_use]
    pub fn reaches_root(&self, slug: &str) -> bool {
        let Some(root) = self.ontology_root() else {
            return false;
        };
        // `seen` keeps cyclic definitions from looping forever.
        let mut seen: BTreeSet<&str> = BTreeSet::new();
        let mut stack: Vec<&str> = vec![slug];
        while let Some(current) = stack.pop() {
            if current == root.slug {
                return true;
            }
            if !seen.insert(current) {
                continue;
            }
            if let Some(meaning) = self.meaning(current) {
                for target in &meaning.defined_by {
                    stack.push(target.as_str());
                }
            }
        }
        false
    }

    /// Returns the first meaning carrying `ROLE_ONTOLOGY_TYPE`.
    #[must_use]
    pub fn ontology_type_root(&self) -> Option<&Meaning> {
        self.meanings
            .iter()
            .find(|m| m.has_role(ROLE_ONTOLOGY_TYPE))
    }

    /// Iterates over the meanings carrying `ROLE_ONTOLOGY_CATEGORY`.
    pub fn ontology_categories(&self) -> impl Iterator<Item = &Meaning> {
        self.meanings_with_role(ROLE_ONTOLOGY_CATEGORY)
    }
}
/// Reports whether `expected` occurs in `normalized`.
///
/// An empty `expected` never matches. Surfaces for which
/// `crate::coding::contains_cjk` returns true are matched as plain substrings;
/// all others must be delimited by a single space or by the start/end of
/// `normalized`.
fn surface_present(normalized: &str, expected: &str) -> bool {
    if expected.is_empty() {
        return false;
    }
    if crate::coding::contains_cjk(expected) {
        return normalized.contains(expected);
    }
    // Padding both sides with a space folds the equality / prefix / suffix /
    // interior checks into one space-delimited search.
    let padded = format!(" {normalized} ");
    padded.contains(&format!(" {expected} "))
}
fn parse_lexicon(text: &str) -> Lexicon {
let root = parse_lino(text);
let mut meanings = Vec::new();
let containers: Vec<&LinoNode> = root
.children
.iter()
.filter(|c| c.name == "meanings")
.collect();
let sources: Vec<&LinoNode> = if containers.is_empty() {
vec![&root]
} else {
containers
};
for container in sources {
for node in container
.children
.iter()
.filter(|c| c.name == "meaning" || c.name != "meanings")
{
meanings.push(parse_meaning(node));
}
}
Lexicon { meanings }
}
/// Builds a [`Meaning`] from one parsed node.
///
/// Recognised children: `defined_by`/`defined-by`, `grounded-in`/`wikidata`,
/// `role`, `lexeme` (with `word`/`surface` children) and a standalone `surface`.
/// Other children are ignored here (facets are read by `parse_semantic_facets`).
fn parse_meaning(node: &LinoNode) -> Meaning {
    let slug = meaning_slug(node);
    let semantic_facets = parse_semantic_facets(node);

    // In shorthand form (node name is the slug) the node id carries definition targets.
    let mut defined_by: Vec<String> = if node.name == "meaning" || node.id.is_empty() {
        Vec::new()
    } else {
        definition_targets(&node.id).collect()
    };
    let mut roles = Vec::new();
    let mut lexemes = Vec::new();
    let mut wikidata = String::new();

    for child in &node.children {
        match child.name.as_str() {
            "defined_by" | "defined-by" => defined_by.extend(definition_targets(&child.id)),
            // A later grounding child overwrites an earlier one.
            "grounded-in" | "wikidata" => wikidata.clone_from(&child.id),
            "role" => roles.push(child.id.clone()),
            "lexeme" => {
                let mut words = Vec::new();
                for entry in &child.children {
                    if matches!(entry.name.as_str(), "word" | "surface") {
                        words.push(parse_word_form(&slug, entry));
                    }
                }
                lexemes.push(Lexeme {
                    language: lexeme_language(child),
                    words,
                });
            }
            // A bare `surface` becomes its own single-word lexeme.
            "surface" => lexemes.push(Lexeme {
                language: child.find_child_value("language").to_string(),
                words: vec![parse_word_form(&slug, child)],
            }),
            _ => {}
        }
    }

    let gloss = generated_meaning_description(&slug, &defined_by, node.find_child_value("gloss"));
    let wiktionary = node.find_child_value("wiktionary").to_string();
    Meaning {
        slug,
        gloss,
        wiktionary,
        wikidata,
        defined_by,
        roles,
        semantic_facets,
        lexemes,
    }
}
/// Splits `raw` on whitespace, parentheses, square brackets and commas, and
/// yields each non-empty piece in canonical form.
fn definition_targets(raw: &str) -> impl Iterator<Item = String> + '_ {
    fn is_separator(character: char) -> bool {
        matches!(character, '(' | ')' | '[' | ']' | ',') || character.is_whitespace()
    }

    raw.split(is_separator)
        .filter(|piece| !piece.is_empty())
        .map(canonical_definition_target)
}
/// Maps a known underscore-spelled definition target to its hyphenated slug.
///
/// Any target not in the table is returned unchanged.
fn canonical_definition_target(target: &str) -> String {
    const HYPHENATED: &[(&str, &str)] = &[
        ("reference_action", "reference-action"),
        ("link_action", "link-action"),
        ("any_of_reference", "any-of-reference"),
        ("any_of_link", "any-of-link"),
        ("repeatable_from_zero", "repeatable-from-zero"),
        ("zero_or_more", "zero-or-more"),
        ("point_at", "point-at"),
        ("or_else", "or-else"),
        ("is_identity", "is-identity"),
        ("is_a_kind_of", "is-a-kind-of"),
        ("held_by", "held-by"),
        ("together_with", "together-with"),
        ("self_equation", "self-equation"),
        ("one_symbol_one_meaning", "one-symbol-one-meaning"),
        ("sense_split", "sense-split"),
        ("bank_river", "bank-river"),
        ("bank_money", "bank-money"),
    ];

    HYPHENATED
        .iter()
        .find(|pair| pair.0 == target)
        .map_or(target, |pair| pair.1)
        .to_string()
}
/// Builds a [`WordForm`] from a `word`/`surface` node belonging to `parent_meaning`.
///
/// On top of the facets parsed from the node, every form gets a `notation` facet
/// targeting `word_surface` and a `denotation` facet targeting `parent_meaning`.
fn parse_word_form(parent_meaning: &str, node: &LinoNode) -> WordForm {
    let mut facets = parse_semantic_facets(node);
    for (kind, target) in [("notation", "word_surface"), ("denotation", parent_meaning)] {
        ensure_semantic_facet_target(&mut facets, kind, target);
    }

    let text = surface_text(node);
    let description = generated_word_description(parent_meaning, node);
    let action = node.find_child_value("action").to_string();
    WordForm {
        text,
        description,
        action,
        semantic_facets: facets,
    }
}
/// Makes sure `facets` has a facet of `kind` that lists `target`.
///
/// The target is appended to the first facet of that kind if it is missing; when
/// no facet of that kind exists, one is created holding only `target`.
fn ensure_semantic_facet_target(facets: &mut Vec<SemanticFacet>, kind: &str, target: &str) {
    match facets.iter_mut().find(|facet| facet.kind == kind) {
        Some(existing) => {
            if existing.meanings.iter().all(|known| known != target) {
                existing.meanings.push(target.to_string());
            }
        }
        None => facets.push(SemanticFacet {
            kind: kind.to_string(),
            meanings: vec![target.to_string()],
        }),
    }
}
/// Child node names that `parse_semantic_facets` treats as shorthand facets:
/// the node name is the facet kind and the node id is its single target.
const FACET_KINDS: &[&str] = &[
"notation",
"annotation",
"denotation",
"connotation",
"part_of_speech",
"self-equation",
];
/// Collects the semantic facets declared by the children of `node`.
///
/// Two spellings are recognised: an explicit `facet` child (its id is the kind,
/// its children are the targets) and a shorthand child whose name is one of
/// `FACET_KINDS` and whose non-empty id is the single target. Facets of the same
/// kind are merged.
fn parse_semantic_facets(node: &LinoNode) -> Vec<SemanticFacet> {
    let mut facets: Vec<SemanticFacet> = Vec::new();
    for child in &node.children {
        match child.name.as_str() {
            "facet" => merge_facet_targets(
                &mut facets,
                &child.id,
                child.children.iter().filter_map(semantic_facet_target),
            ),
            shorthand if !child.id.is_empty() && FACET_KINDS.contains(&shorthand) => {
                merge_facet_targets(&mut facets, shorthand, std::iter::once(child.id.clone()));
            }
            _ => {}
        }
    }
    facets
}
/// Adds `targets` to the facet of `kind`, creating that facet if necessary.
///
/// Targets already present are not added again. The facet is created even when
/// `targets` turns out to be empty.
fn merge_facet_targets(
    facets: &mut Vec<SemanticFacet>,
    kind: &str,
    targets: impl Iterator<Item = String>,
) {
    let index = match facets.iter().position(|facet| facet.kind == kind) {
        Some(found) => found,
        None => {
            facets.push(SemanticFacet {
                kind: kind.to_string(),
                meanings: Vec::new(),
            });
            facets.len() - 1
        }
    };
    let known = &mut facets[index].meanings;
    for target in targets {
        if !known.contains(&target) {
            known.push(target);
        }
    }
}
/// Returns the slug of a meaning node: the id of an explicit `meaning` node,
/// otherwise the node's own name.
fn meaning_slug(node: &LinoNode) -> String {
    match node.name.as_str() {
        "meaning" => node.id.clone(),
        _ => node.name.clone(),
    }
}
/// Returns the language of a `lexeme` node: its `language` child value when that
/// is non-empty, otherwise the node id.
fn lexeme_language(node: &LinoNode) -> String {
    let declared = node.find_child_value("language");
    if !declared.is_empty() {
        return declared.to_string();
    }
    node.id.clone()
}
/// Returns the surface text of a word node.
///
/// Preference order: a non-empty `text` child, then a non-empty `codepoints`
/// child passed through `decode_codepoints`, then the node id.
fn surface_text(node: &LinoNode) -> String {
    let literal = node.find_child_value("text");
    if !literal.is_empty() {
        return literal.to_string();
    }
    let encoded = node.find_child_value("codepoints");
    if !encoded.is_empty() {
        return decode_codepoints(encoded);
    }
    node.id.clone()
}
/// Chooses the gloss for a meaning.
///
/// A non-empty `stored` gloss wins. Otherwise the gloss is the bare `slug`, or
/// `"<slug> defined by <targets>"` (targets joined by spaces) when `defined_by`
/// is non-empty.
fn generated_meaning_description(slug: &str, defined_by: &[String], stored: &str) -> String {
    match (stored.is_empty(), defined_by.is_empty()) {
        (false, _) => stored.to_string(),
        (true, true) => slug.to_string(),
        (true, false) => format!("{} defined by {}", slug, defined_by.join(" ")),
    }
}
/// Chooses the description for a word form.
///
/// A non-empty `description` child wins. Otherwise the description is
/// `"<surface> denotes <parent_meaning>"`, or just `parent_meaning` when the
/// node has no surface text.
fn generated_word_description(parent_meaning: &str, node: &LinoNode) -> String {
    let stored = node.find_child_value("description");
    if stored.is_empty() {
        let surface = surface_text(node);
        if surface.is_empty() {
            parent_meaning.to_string()
        } else {
            format!("{surface} denotes {parent_meaning}")
        }
    } else {
        stored.to_string()
    }
}
/// Extracts the facet target named by one child of a `facet` node.
///
/// A non-empty id is always the target. With an empty id, the node name is used
/// as the target, except for the wrapper keywords `meaning`, `target` and
/// `facet-target`: those name no target themselves, so they yield `None`
/// instead of recording an empty-string target.
fn semantic_facet_target(node: &LinoNode) -> Option<String> {
    if !node.id.is_empty() {
        return Some(node.id.clone());
    }
    match node.name.as_str() {
        // Wrapper keywords (or a nameless node) without an id carry no target.
        "" | "meaning" | "target" | "facet-target" => None,
        name => Some(name.to_string()),
    }
}
/// Returns the process-wide lexicon, parsing it on first use.
///
/// The entries of `MEANING_FILES` are joined with newlines and parsed once; later
/// calls return the cached value.
#[must_use]
pub fn lexicon() -> &'static Lexicon {
    static CACHE: OnceLock<Lexicon> = OnceLock::new();
    CACHE.get_or_init(|| {
        let combined = MEANING_FILES.join("\n");
        parse_lexicon(&combined)
    })
}