use std::collections::BTreeMap;
use std::fmt::Write as _;
use serde_json::Value;
/// Feature id that `has_feature` looks for in a form's `grammaticalFeatures` to recognise a
/// singular form; it is also emitted verbatim as the `feature` of the singular form.
const SINGULAR_FEATURE: &str = "Q110786";
/// Feature id that `has_feature` looks for in a form's `grammaticalFeatures` to recognise a
/// plural form; it is also emitted verbatim as the `feature` of the plural form.
const PLURAL_FEATURE: &str = "Q146786";
/// Both number features together; `case_key` drops these so that two forms can be compared on
/// their remaining (non-number) features only.
const NUMBER_FEATURES: [&str; 2] = [SINGULAR_FEATURE, PLURAL_FEATURE];
/// Static reference to one lexeme document that is compiled into the binary.
#[derive(Debug, Clone, Copy)]
pub struct SourceRef {
/// Lexeme id; used as the lookup key into the map of parsed entities.
pub id: &'static str,
/// JSON text of the document; it is parsed for a top-level `entities` object.
pub json: &'static str,
}
/// Static description of an additional surface that is rendered from its language and text
/// alone, without any lexeme forms.
#[derive(Debug, Clone, Copy)]
pub struct ExtraRef {
/// Language code the surface is grouped under when rendered.
pub language: &'static str,
/// Surface text written out verbatim.
pub text: &'static str,
}
/// Static description of one concept whose meaning block can be enriched.
#[derive(Debug, Clone, Copy)]
pub struct Concept {
/// Concept name; first line of the rendered block and part of the final answer text.
pub name: &'static str,
/// Identifier written after `grounded-in` in the rendered block.
pub grounded_in: &'static str,
/// Search query text. NOTE(review): not read by the code in this section; presumably
/// consumed by callers elsewhere.
pub search_query: &'static str,
/// URL of a source document. NOTE(review): not read by the code in this section; presumably
/// consumed by callers elsewhere.
pub source_url: &'static str,
/// Path shown in the final answer as the location of the enriched block.
pub kb_path: &'static str,
/// Embedded lexeme documents; rendered in this order.
pub sources: &'static [SourceRef],
/// Additional surfaces rendered without lexeme forms.
pub extras: &'static [ExtraRef],
/// Keywords that select this concept when one of them occurs in a lower-cased prompt.
pub keywords: &'static [&'static str],
}
/// Task prompt asking for the tomato meaning to be made more detailed.
///
/// The trailing `\` continuations join the three source lines into one single-line string.
pub const MEANING_DETAIL_TASK: &str = "Make the tomato meaning more detailed: pin every surface's \
part of speech and grammatical number, ground it in Wikidata, \
and add the missing plural to томат.";
/// Task prompt asking for the potato word and meaning to be made richer.
///
/// Note that this prompt contains none of the `DETAIL_KEYWORDS` phrases; it is recognised by
/// `is_meaning_detail_task` only through the `potato` concept keyword.
pub const POTATO_DETAIL_TASK: &str = "Please make the potato word and meaning richer — record the \
singular/plural of each surface, add the missing plural form \
potatoes, and keep it grounded in Wikidata.";
/// The tomato concept: three embedded lexeme documents plus Hindi and Chinese extra surfaces.
pub const TOMATO: Concept = Concept {
name: "tomato",
grounded_in: "Q23501",
search_query: "Wikidata lexemes tomato помидор томат grammatical number forms",
source_url: "https://raw.githubusercontent.com/link-assistant/formal-ai/issue-538-eca4a11c39c6/data/cache/wikidata/lexeme/L170542.json",
kb_path: "meanings-tomato-detail.lino",
// Source lexemes are rendered in the order listed here.
sources: &[
SourceRef {
id: "L7993",
json: include_str!("../../data/cache/wikidata/lexeme/L7993.json"),
},
SourceRef {
id: "L3526",
json: include_str!("../../data/cache/wikidata/lexeme/L3526.json"),
},
SourceRef {
id: "L170542",
json: include_str!("../../data/cache/wikidata/lexeme/L170542.json"),
},
],
// Surfaces without an embedded lexeme; grouped by language when rendered.
extras: &[
ExtraRef {
language: "hi",
text: "टमाटर",
},
ExtraRef {
language: "zh",
text: "番茄",
},
ExtraRef {
language: "zh",
text: "西红柿",
},
],
// Any of these occurring in a lower-cased prompt selects this concept.
keywords: &["помидор", "томат", "tomato"],
};
/// The potato concept: one embedded lexeme document plus Russian, Hindi and Chinese extra
/// surfaces.
pub const POTATO: Concept = Concept {
name: "potato",
grounded_in: "Q10998",
search_query: "Wikidata lexemes potato картофель картошка grammatical number forms",
source_url: "https://raw.githubusercontent.com/link-assistant/formal-ai/issue-538-eca4a11c39c6/data/cache/wikidata/lexeme/L3784.json",
kb_path: "meanings-potato-detail.lino",
sources: &[SourceRef {
id: "L3784",
json: include_str!("../../data/cache/wikidata/lexeme/L3784.json"),
}],
// Surfaces without an embedded lexeme; grouped by language when rendered.
extras: &[
ExtraRef {
language: "ru",
text: "картофель",
},
ExtraRef {
language: "ru",
text: "картошка",
},
ExtraRef {
language: "hi",
text: "आलू",
},
ExtraRef {
language: "zh",
text: "土豆",
},
ExtraRef {
language: "zh",
text: "马铃薯",
},
],
// Any of these occurring in a lower-cased prompt selects this concept.
keywords: &["potato", "картофель", "картошка", "आलू", "土豆", "马铃薯"],
};
/// All known concepts, in the order `concept_for_task` tries them (first match wins).
pub const CONCEPTS: &[&Concept] = &[&TOMATO, &POTATO];
/// Lower-case phrases; a prompt whose lower-cased text contains any of them is treated as a
/// meaning-detail task by `is_meaning_detail_task`.
const DETAIL_KEYWORDS: [&str; 6] = [
"grammatical number",
"more detailed",
"singular or plural",
"part of speech",
"detailed meaning",
"detailed word",
];
/// Finds the concept a prompt is talking about.
///
/// The prompt and every keyword are lower-cased before a substring comparison. Concepts are
/// tried in `CONCEPTS` order and the first one with any matching keyword is returned; `None`
/// means no keyword of any concept occurs in the prompt.
#[must_use]
pub fn concept_for_task(prompt: &str) -> Option<&'static Concept> {
    let haystack = prompt.to_lowercase();
    for &candidate in CONCEPTS {
        for word in candidate.keywords {
            let needle = word.to_lowercase();
            if haystack.contains(needle.as_str()) {
                return Some(candidate);
            }
        }
    }
    None
}
/// Reports whether a prompt should be handled as a meaning-detail task.
///
/// Either condition alone is sufficient: the lower-cased prompt contains one of the
/// `DETAIL_KEYWORDS` phrases, or `concept_for_task` recognises a concept in the prompt.
#[must_use]
pub fn is_meaning_detail_task(prompt: &str) -> bool {
    let haystack = prompt.to_lowercase();
    for phrase in DETAIL_KEYWORDS.iter() {
        if haystack.contains(*phrase) {
            return true;
        }
    }
    // No generic phrase matched: fall back to the concept keywords.
    concept_for_task(prompt).is_some()
}
/// One selected form of a source lexeme, ready for rendering.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexemeForm {
/// Part of the form id after its last `-` (the whole id when it has no `-`).
pub suffix: String,
/// Text of the form's representation.
pub text: String,
/// Grammatical number label: `"singular"` or `"plural"`.
pub number: String,
/// Feature id that goes with `number`.
pub feature: String,
}
/// A source lexeme reduced to the fields that are rendered.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SourceLexeme {
/// Lexeme id, copied from the `SourceRef` it was looked up by.
pub id: String,
/// Language code: the key of the first entry in the entity's `lemmas` object.
pub language: String,
/// Value of the entity's `language` field.
pub language_item: String,
/// Value of the entity's `lexicalCategory` field.
pub category: String,
/// Id of the entity's first sense, if it has one.
pub sense: Option<String>,
/// Selected forms: at most one singular followed by at most one plural.
pub forms: Vec<LexemeForm>,
}
/// Owned copy of an `ExtraRef`: a surface rendered from language and text alone.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtraSurface {
/// Language code the surface is grouped under.
pub language: String,
/// Surface text.
pub text: String,
}
/// Everything `render_block` needs for one concept besides the concept itself.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ConceptLexemes {
/// Derived source lexemes, in the order of the concept's `sources`.
pub sources: Vec<SourceLexeme>,
/// Extra surfaces, in the order of the concept's `extras`.
pub extras: Vec<ExtraSurface>,
}
/// Lexeme entity as parsed from JSON, before a singular/plural pair is selected.
#[derive(Debug, Clone)]
struct RawLexeme {
// Value of the entity's `language` field.
language_item: String,
// Key of the first entry in the entity's `lemmas` object.
language_code: String,
// `value` of that first lemma entry; empty when it is missing or not a string.
lemma: String,
// Value of the entity's `lexicalCategory` field.
category: String,
// `id` of the first element of `senses`, if present.
sense: Option<String>,
// Every element of `forms` that could be parsed.
forms: Vec<RawForm>,
}
/// Lexeme form as parsed from JSON.
#[derive(Debug, Clone)]
struct RawForm {
// Value of the form's `id` field.
id: String,
// `value` of the first entry in the form's `representations` object.
text: String,
// String elements of `grammaticalFeatures`; empty when that field is absent.
features: Vec<String>,
}
/// Parses a JSON document and returns its lexeme entities keyed by entity id.
///
/// The document must be valid JSON with a top-level `entities` object; otherwise the result
/// is an empty map. Entities that `parse_lexeme_entity` rejects are skipped silently.
fn parse_entities(text: &str) -> BTreeMap<String, RawLexeme> {
    let parsed: Option<Value> = serde_json::from_str(text).ok();
    let listing = parsed
        .as_ref()
        .and_then(|doc| doc.get("entities"))
        .and_then(Value::as_object);
    match listing {
        Some(entities) => entities
            .iter()
            .filter_map(|(id, entity)| parse_lexeme_entity(entity).map(|raw| (id.clone(), raw)))
            .collect(),
        None => BTreeMap::new(),
    }
}
fn parse_lexeme_entity(entity: &Value) -> Option<RawLexeme> {
let language_item = entity.get("language")?.as_str()?.to_owned();
let category = entity.get("lexicalCategory")?.as_str()?.to_owned();
let lemmas = entity.get("lemmas")?.as_object()?;
let (language_code, lemma_value) = lemmas.iter().next()?;
let language_code = language_code.clone();
let lemma = lemma_value
.get("value")
.and_then(Value::as_str)
.unwrap_or_default()
.to_owned();
let sense = entity
.get("senses")
.and_then(Value::as_array)
.and_then(|senses| senses.first())
.and_then(|sense| sense.get("id"))
.and_then(Value::as_str)
.map(str::to_owned);
let mut forms = Vec::new();
if let Some(array) = entity.get("forms").and_then(Value::as_array) {
for form in array {
if let Some(raw) = parse_form(form) {
forms.push(raw);
}
}
}
Some(RawLexeme {
language_item,
language_code,
lemma,
category,
sense,
forms,
})
}
/// Extracts a `RawForm` from one form object.
///
/// Returns `None` unless the form has a string `id` and a `representations` object whose
/// first entry carries a string `value`. `grammaticalFeatures` is optional; non-string
/// elements in it are ignored.
fn parse_form(form: &Value) -> Option<RawForm> {
    let id = form.get("id").and_then(Value::as_str)?;

    // Only the first representation is considered.
    let representation = form
        .get("representations")
        .and_then(Value::as_object)?
        .values()
        .next()?;
    let text = representation.get("value").and_then(Value::as_str)?;

    let mut features = Vec::new();
    if let Some(items) = form.get("grammaticalFeatures").and_then(Value::as_array) {
        features.extend(items.iter().filter_map(Value::as_str).map(String::from));
    }

    Some(RawForm {
        id: id.to_owned(),
        text: text.to_owned(),
        features,
    })
}
/// Returns `true` when `feature` is exactly one of the form's grammatical features.
fn has_feature(form: &RawForm, feature: &str) -> bool {
    for candidate in &form.features {
        if candidate.as_str() == feature {
            return true;
        }
    }
    false
}
/// Builds a comparison key from a form's features with the number features removed.
///
/// The remaining features are sorted, so two forms that differ only in grammatical number
/// (and in feature order) produce equal keys.
fn case_key(form: &RawForm) -> Vec<String> {
    let mut key = Vec::new();
    for feature in &form.features {
        let is_number = NUMBER_FEATURES
            .iter()
            .any(|number| *number == feature.as_str());
        if !is_number {
            key.push(feature.clone());
        }
    }
    key.sort();
    key
}
/// Picks the singular form that represents the lexeme.
///
/// Preference order: the first singular form whose text equals the lemma, otherwise the
/// first singular form in document order, otherwise `None`.
fn select_singular(raw: &RawLexeme) -> Option<&RawForm> {
    let mut fallback = None;
    for form in &raw.forms {
        if !has_feature(form, SINGULAR_FEATURE) {
            continue;
        }
        if form.text == raw.lemma {
            return Some(form);
        }
        // Remember only the earliest singular in case no text matches the lemma.
        if fallback.is_none() {
            fallback = Some(form);
        }
    }
    fallback
}
/// Picks the plural form that pairs with the chosen singular.
///
/// When a singular is given, the first plural form whose `case_key` equals the singular's is
/// preferred. If there is no singular, or no plural shares its key, the first plural form in
/// document order is returned. `None` means the lexeme has no plural form at all.
fn select_plural<'a>(raw: &'a RawLexeme, singular: Option<&RawForm>) -> Option<&'a RawForm> {
    let wanted = singular.map(case_key);
    let mut fallback: Option<&'a RawForm> = None;
    for form in &raw.forms {
        if !has_feature(form, PLURAL_FEATURE) {
            continue;
        }
        let same_case = wanted
            .as_ref()
            .map_or(false, |key| case_key(form) == *key);
        if same_case {
            return Some(form);
        }
        // Keep the earliest plural as the answer when no key matches.
        fallback = fallback.or(Some(form));
    }
    fallback
}
/// Returns the part of a form id after its last `-`.
///
/// An id without any `-` is returned unchanged, and an id ending in `-` yields the empty
/// string. `rsplit_once` is used so that the "no separator" case is an explicit branch rather
/// than an `unwrap_or` fallback that can never be taken.
fn form_suffix(form_id: &str) -> String {
    let suffix = match form_id.rsplit_once('-') {
        Some((_, tail)) => tail,
        None => form_id,
    };
    suffix.to_owned()
}
/// Converts a parsed form into a `LexemeForm` labelled with the given number and feature.
///
/// The suffix is derived from the form id via `form_suffix`; all strings are copied.
fn lexeme_form(form: &RawForm, number: &str, feature: &str) -> LexemeForm {
    let suffix = form_suffix(form.id.as_str());
    let text = String::from(form.text.as_str());
    LexemeForm {
        suffix,
        text,
        number: String::from(number),
        feature: String::from(feature),
    }
}
/// Reduces a parsed lexeme to a `SourceLexeme` carrying at most one singular and one plural
/// form.
///
/// The singular is chosen first because the plural selection tries to match its case key.
/// The singular (if any) always precedes the plural (if any) in `forms`.
fn derive_source(id: &str, raw: &RawLexeme) -> SourceLexeme {
    let singular = select_singular(raw);
    let plural = select_plural(raw, singular);

    let mut forms = Vec::new();
    forms.extend(singular.map(|form| lexeme_form(form, "singular", SINGULAR_FEATURE)));
    forms.extend(plural.map(|form| lexeme_form(form, "plural", PLURAL_FEATURE)));

    SourceLexeme {
        id: String::from(id),
        language: raw.language_code.clone(),
        language_item: raw.language_item.clone(),
        category: raw.category.clone(),
        sense: raw.sense.clone(),
        forms,
    }
}
/// Builds the renderable lexeme data for a concept.
///
/// `fetched`, when given, is parsed as a JSON entities document and used only if it contains
/// every lexeme id listed in `concept.sources`. Otherwise (absent, unparsable, or
/// incomplete) the JSON embedded in the concept is used instead. Sources missing from the
/// chosen entity map are skipped; extras are copied one-to-one.
#[must_use]
pub fn concept_lexemes(concept: &Concept, fetched: Option<&str>) -> ConceptLexemes {
    let entities = match fetched.map(parse_entities) {
        // Accept fetched data only when it covers all required lexemes.
        Some(map) if concept.sources.iter().all(|source| map.contains_key(source.id)) => map,
        _ => embedded_entities(concept),
    };

    let mut sources = Vec::new();
    for source in concept.sources {
        if let Some(raw) = entities.get(source.id) {
            sources.push(derive_source(source.id, raw));
        }
    }

    let mut extras = Vec::new();
    for extra in concept.extras {
        extras.push(ExtraSurface {
            language: String::from(extra.language),
            text: String::from(extra.text),
        });
    }

    ConceptLexemes { sources, extras }
}
/// Parses every JSON document embedded in the concept and merges the resulting entities.
///
/// Documents are merged in `concept.sources` order, so an entity id that appears in more
/// than one document keeps the value from the later document.
fn embedded_entities(concept: &Concept) -> BTreeMap<String, RawLexeme> {
    concept
        .sources
        .iter()
        .fold(BTreeMap::new(), |mut merged, source| {
            merged.extend(parse_entities(source.json));
            merged
        })
}
/// Produces a trimmed copy of a lexeme entity that keeps only the fields this module reads.
///
/// Kept at the top level: `id`, `lemmas`, `language`, `lexicalCategory`, `forms`, `senses`.
/// Each form keeps `id`, `representations`, `grammaticalFeatures`; each sense keeps `id`.
/// A missing field is written as JSON `null`; a missing or non-array `forms`/`senses` becomes
/// an empty array.
fn lexeme_core(entity: &Value) -> Value {
    // Clone `key` out of `from`, or JSON null when it is absent.
    fn pick(from: &Value, key: &str) -> Value {
        from.get(key).cloned().unwrap_or(Value::Null)
    }

    // Reduce every element of the array stored under `key` to the fields listed in `keep`.
    fn project(entity: &Value, key: &str, keep: &[&str]) -> Value {
        let items = match entity.get(key).and_then(Value::as_array) {
            Some(array) => array
                .iter()
                .map(|item| {
                    let mut slim = serde_json::Map::new();
                    for field in keep {
                        slim.insert((*field).to_owned(), pick(item, field));
                    }
                    Value::Object(slim)
                })
                .collect(),
            None => Vec::new(),
        };
        Value::Array(items)
    }

    let mut core = serde_json::Map::new();
    for key in ["id", "lemmas", "language", "lexicalCategory"] {
        core.insert(key.to_owned(), pick(entity, key));
    }
    core.insert(
        "forms".to_owned(),
        project(entity, "forms", &["id", "representations", "grammaticalFeatures"]),
    );
    core.insert("senses".to_owned(), project(entity, "senses", &["id"]));
    Value::Object(core)
}
/// Serialises all embedded lexemes of a concept into one compact JSON document of the form
/// `{"entities": {...}}`, with every entity trimmed by `lexeme_core`.
///
/// Embedded documents that are not a JSON object, or that lack an `entities` object, are
/// skipped. If serialisation fails the empty string is returned.
#[must_use]
pub fn source_bundle(concept: &Concept) -> String {
    let mut merged = serde_json::Map::new();
    for source in concept.sources {
        let Ok(doc) = serde_json::from_str::<Value>(source.json) else {
            continue;
        };
        let Some(listing) = doc.get("entities").and_then(Value::as_object) else {
            continue;
        };
        for (id, entity) in listing {
            merged.insert(id.clone(), lexeme_core(entity));
        }
    }

    let mut bundle = serde_json::Map::new();
    bundle.insert("entities".to_owned(), Value::Object(merged));
    serde_json::to_string(&Value::Object(bundle)).unwrap_or_default()
}
/// Maps a language code to the lower-case English language name used in rendered comments.
///
/// Codes other than `en`, `ru`, `hi` and `zh` map to `"unknown"`; matching is exact and
/// case-sensitive.
fn language_name(code: &str) -> &'static str {
    const KNOWN: [(&str, &str); 4] = [
        ("en", "english"),
        ("ru", "russian"),
        ("hi", "hindi"),
        ("zh", "chinese"),
    ];
    KNOWN
        .iter()
        .find(|(known, _)| *known == code)
        .map_or("unknown", |(_, label)| *label)
}
#[must_use]
pub fn render_block(concept: &Concept, lexemes: &ConceptLexemes) -> String {
let mut out = String::new();
let _ = writeln!(out, " {}", concept.name);
let _ = writeln!(out, " grounded-in {}", concept.grounded_in);
let _ = writeln!(out, " defined-by entity");
let _ = writeln!(out, " role compositional_lemma");
for source in &lexemes.sources {
let name = language_name(&source.language);
let lemma = source.forms.first().map(|form| form.text.as_str());
let comment_lemma = source.language == "ru";
let lemma_suffix = match (comment_lemma, lemma) {
(true, Some(text)) => format!(" {text}"),
_ => String::new(),
};
let _ = writeln!(
out,
" source-lexeme {} # wikidata {name} source lexeme{lemma_suffix}",
source.id
);
let _ = writeln!(
out,
" language {} # wikidata language {name}",
source.language_item
);
let _ = writeln!(
out,
" lexical-category {} # wikidata category noun",
source.category
);
for form in &source.forms {
let _ = writeln!(
out,
" form {}-{} # wikidata form {}",
source.id, form.suffix, form.text
);
let _ = writeln!(
out,
" feature {} # wikidata grammatical feature {}",
form.feature, form.number
);
}
if let Some(sense) = &source.sense {
let _ = writeln!(out, " sense {sense} # wikidata grounded sense");
}
for form in &source.forms {
let comment_text = if comment_lemma {
format!(" {}", form.text)
} else {
String::new()
};
let _ = writeln!(
out,
" surface {}-{} # wikidata {name} {} surface{comment_text}",
source.id, form.suffix, form.number
);
let _ = writeln!(out, " text {}", form.text);
let _ = writeln!(out, " language {}", source.language);
let _ = writeln!(out, " part_of_speech noun");
let _ = writeln!(out, " grammatical_number {}", form.number);
if let Some(sense) = &source.sense {
let _ = writeln!(out, " sense {sense} # wikidata grounded sense");
}
}
}
let mut seen_languages: Vec<&str> = Vec::new();
for extra in &lexemes.extras {
if !seen_languages.contains(&extra.language.as_str()) {
seen_languages.push(&extra.language);
}
}
for language in seen_languages {
let _ = writeln!(out, " lexeme {language}");
for extra in lexemes.extras.iter().filter(|e| e.language == language) {
let _ = writeln!(out, " surface");
let _ = writeln!(out, " text {}", extra.text);
let _ = writeln!(out, " part_of_speech noun");
}
}
out
}
/// Convenience wrapper: gathers the concept's lexemes (preferring `fetched` when it is
/// complete, see `concept_lexemes`) and renders them with `render_block`.
#[must_use]
pub fn enrich_block(concept: &Concept, fetched: Option<&str>) -> String {
    let lexemes = concept_lexemes(concept, fetched);
    render_block(concept, &lexemes)
}
/// Builds the final answer text for a concept: a one-sentence summary naming the concept,
/// followed by the knowledge-base path and the enriched block.
///
/// Trailing whitespace of `block` is trimmed so the answer does not end in blank lines.
#[must_use]
pub fn final_answer_for(concept: &Concept, block: &str) -> String {
    let name = concept.name;
    let path = concept.kb_path;
    let block = block.trim_end();
    format!(
        "Made the {name} meaning more detailed: every surface now pins its part of speech and \
         grammatical number, is grounded in its Wikidata lexeme forms, and every plural surface \
         recovered from the source is added.\n\n\
         Enriched meaning block ({path}):\n\n{block}"
    )
}