use std::collections::HashSet;
use serde::Deserialize;
use crate::conlang::generate::word;
use crate::conlang::phonology::{allophony_eval, validator};
use crate::conlang::types::{Phonology, TemplateRole};
use crate::language_entry::DictionaryEntry;
/// A candidate lexicon entry, deserializable from one JSON object (see `parse_proposals`).
///
/// `form` and `gloss` carry no `#[serde(default)]`; the remaining fields fall back to
/// their empty default when the input omits them.
#[derive(Debug, Clone, Deserialize, PartialEq)]
pub struct LexProposal {
/// Proposed word form. `dedup` trims it before checking legality and homophony.
pub form: String,
/// Meaning of the word. `dedup` compares it, trimmed and lowercased, with existing translations.
pub gloss: String,
/// Part-of-speech label (the tests use `"noun"`); empty string when omitted.
#[serde(default)]
pub pos: String,
/// NOTE(review): presumably an example usage of the word; not read in this file. Empty when omitted.
#[serde(default)]
pub example: String,
/// NOTE(review): presumably a usage register label; not read in this file. Empty when omitted.
#[serde(default)]
pub register: String,
/// NOTE(review): presumably semantic domain tags; not read in this file. Empty list when omitted.
#[serde(default)]
pub domain: Vec<String>,
}
/// Why `dedup` turned a proposal down.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RejectReason {
    /// The form is empty after trimming, or it fails the phonotactic validator.
    Illegal,
    /// The surface form equals that of an existing word or of an earlier kept proposal.
    Homophone,
    /// The normalized gloss equals an existing translation or an earlier kept gloss.
    DuplicateMeaning,
}

impl RejectReason {
    /// Returns the fixed, human-readable explanation for this rejection.
    pub fn as_str(self) -> &'static str {
        match self {
            RejectReason::Illegal => "phonotactically illegal",
            RejectReason::Homophone => "homophone of an existing word",
            RejectReason::DuplicateMeaning => "duplicates an existing meaning",
        }
    }
}
/// Generates fresh root words whose surface forms do not collide with the dictionary.
///
/// Requests `max(4 * target, 8)` candidates (the multiplication saturates) from
/// `word::generate_words` with `TemplateRole::Root`, then keeps, in generation order,
/// every candidate whose surface form is neither used by an entry of `existing`
/// (entry words are trimmed first) nor already produced by an earlier candidate.
/// The result is not truncated to `target`.
pub fn build_pool(phon: &Phonology, existing: &[DictionaryEntry], target: usize) -> Vec<String> {
    let mut occupied: HashSet<String> = HashSet::new();
    for entry in existing {
        occupied.insert(surface_key(phon, entry.word.trim()));
    }

    // Over-generate so that enough candidates survive the collision filter.
    let budget = target.saturating_mul(4).max(8);
    let mut emitted: HashSet<String> = HashSet::new();

    word::generate_words(phon, TemplateRole::Root, budget)
        .into_iter()
        .filter(|candidate| {
            let key = surface_key(phon, candidate);
            // `insert` returns false when this surface form was already emitted.
            !occupied.contains(&key) && emitted.insert(key)
        })
        .collect()
}
/// Splits `proposals` into those worth keeping and those rejected, with the reason.
///
/// Proposals are examined in order. Each one is rejected as:
/// - `Illegal` when its trimmed form is empty, or when `phon.constraints` is non-empty
///   and `validator::is_legal` refuses the segmented form;
/// - `Homophone` when its surface form matches an existing entry or an earlier kept proposal;
/// - `DuplicateMeaning` when its trimmed, lowercased gloss is non-empty and matches an
///   existing translation or the gloss of an earlier kept proposal.
///
/// Returns `(kept, rejected)`, both in input order.
pub fn dedup(
    phon: &Phonology,
    existing: &[DictionaryEntry],
    proposals: Vec<LexProposal>,
) -> (Vec<LexProposal>, Vec<(LexProposal, RejectReason)>) {
    let mut known_surfaces: HashSet<String> = HashSet::new();
    let mut known_glosses: HashSet<String> = HashSet::new();
    for entry in existing {
        known_surfaces.insert(surface_key(phon, entry.word.trim()));
        let gloss = entry.translation.trim().to_lowercase();
        if !gloss.is_empty() {
            known_glosses.insert(gloss);
        }
    }

    // With no constraints configured, the legality check is skipped.
    let enforce_phonotactics = !phon.constraints.is_empty();

    let mut accepted = Vec::new();
    let mut refused = Vec::new();
    for candidate in proposals {
        let verdict = vet_proposal(
            phon,
            enforce_phonotactics,
            &known_surfaces,
            &known_glosses,
            &candidate,
        );
        match verdict {
            Ok((surface, gloss)) => {
                // Remember the keys so later proposals in the same batch collide with this one.
                known_surfaces.insert(surface);
                if !gloss.is_empty() {
                    known_glosses.insert(gloss);
                }
                accepted.push(candidate);
            }
            Err(reason) => refused.push((candidate, reason)),
        }
    }
    (accepted, refused)
}

/// Checks one proposal against the known surfaces and glosses.
///
/// Returns the proposal's `(surface key, normalized gloss)` when it passes, or the first
/// applicable `RejectReason` otherwise.
fn vet_proposal(
    phon: &Phonology,
    enforce_phonotactics: bool,
    surfaces: &HashSet<String>,
    glosses: &HashSet<String>,
    proposal: &LexProposal,
) -> Result<(String, String), RejectReason> {
    let form = proposal.form.trim();
    if form.is_empty() {
        return Err(RejectReason::Illegal);
    }

    let underlying = phon.segment(form);
    if enforce_phonotactics && !validator::is_legal(phon, &underlying) {
        return Err(RejectReason::Illegal);
    }

    let surface = surface_key(phon, form);
    if surfaces.contains(&surface) {
        return Err(RejectReason::Homophone);
    }

    let gloss = proposal.gloss.trim().to_lowercase();
    if !gloss.is_empty() && glosses.contains(&gloss) {
        return Err(RejectReason::DuplicateMeaning);
    }

    Ok((surface, gloss))
}
/// Builds the comparison key for `word`: it is segmented with `phon.segment`, passed through
/// `allophony_eval::surface_form`, and the resulting pieces are joined with no separator.
fn surface_key(phon: &Phonology, word: &str) -> String {
    let segments = phon.segment(word);
    let surface = allophony_eval::surface_form(phon, &segments);
    surface.join("")
}
/// Cosine similarity of `a` and `b`.
///
/// Returns `0.0` when either vector has zero magnitude (including empty input).
/// The dot product pairs elements with `zip`, so it stops at the shorter slice, while
/// each magnitude is taken over the full slice; callers should pass equal-length vectors.
pub fn cosine(a: &[f32], b: &[f32]) -> f32 {
    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (norm_a, norm_b) = (magnitude(a), magnitude(b));

    // A zero-length vector has no direction; report "no similarity" instead of dividing by zero.
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
    dot / (norm_a * norm_b)
}
pub fn semantic_filter(
kept: Vec<LexProposal>,
existing_vecs: &[Vec<f32>],
kept_vecs: &[Vec<f32>],
threshold: f32,
) -> (Vec<LexProposal>, Vec<(LexProposal, f32)>) {
let mut accepted_vecs: Vec<Vec<f32>> = existing_vecs.to_vec();
let mut accepted = Vec::new();
let mut rejected = Vec::new();
for (i, p) in kept.into_iter().enumerate() {
let v = &kept_vecs[i];
let max = accepted_vecs.iter().map(|e| cosine(v, e)).fold(0.0f32, f32::max);
if max > threshold {
rejected.push((p, max));
} else {
accepted_vecs.push(v.clone());
accepted.push(p);
}
}
(accepted, rejected)
}
/// Extracts the proposal list from a free-form reply that contains one JSON object.
///
/// Everything from the first `{` to the last `}` (inclusive) is parsed as an object with
/// an `entries` array of [`LexProposal`]; a missing `entries` key yields an empty list.
/// Text around the object, such as prose or code fences, is ignored.
///
/// # Errors
///
/// Returns a message when the reply has no `{`, has no `}`, has its last `}` before its
/// first `{`, or when the extracted text is rejected by `serde_json`.
pub fn parse_proposals(raw: &str) -> Result<Vec<LexProposal>, String> {
    #[derive(Deserialize)]
    struct Wrapper {
        #[serde(default)]
        entries: Vec<LexProposal>,
    }

    let payload = match (raw.find('{'), raw.rfind('}')) {
        (None, _) => return Err("no JSON object in reply".into()),
        (Some(_), None) => return Err("no closing brace in reply".into()),
        (Some(open), Some(close)) if close < open => return Err("malformed braces".into()),
        (Some(open), Some(close)) => &raw[open..=close],
    };

    serde_json::from_str::<Wrapper>(payload)
        .map(|wrapper| wrapper.entries)
        .map_err(|e| e.to_string())
}
// Unit tests for proposal dedup, pool building, semantic filtering, cosine similarity
// and reply parsing.
#[cfg(test)]
mod tests {
use super::*;
use crate::conlang::types::{Phoneme, PhonemeKind, PhonotacticConstraint};
// Builds a phoneme whose romanization equals its IPA symbol and which has no sonority value.
fn ph(ipa: &str, kind: PhonemeKind) -> Phoneme {
Phoneme { ipa: ipa.into(), romanize: Some(ipa.into()), kind, sonority: None }
}
// Fixture phonology: consonants k/t/r, vowels a/i, classes `C` and `V`, a single "root"
// template with pattern `C V (C) V`, and one constraint, `MaxClusterSize(1)`.
fn phon() -> Phonology {
let mut p = Phonology {
phonemes: vec![
ph("k", PhonemeKind::Consonant), ph("t", PhonemeKind::Consonant),
ph("r", PhonemeKind::Consonant), ph("a", PhonemeKind::Vowel),
ph("i", PhonemeKind::Vowel),
],
..Default::default()
};
p.classes = [
("C".to_string(), vec!["k", "t", "r"].into_iter().map(String::from).collect()),
("V".to_string(), vec!["a", "i"].into_iter().map(String::from).collect()),
]
.into_iter()
.collect();
p.templates = [(
"root".to_string(),
vec![serde_hjson::from_str(r#"{ "pattern": "C V (C) V" }"#).unwrap()],
)]
.into_iter()
.collect();
// A non-empty constraint list is what makes `dedup` run the phonotactic check.
p.constraints = vec![PhonotacticConstraint::MaxClusterSize(1)];
p
}
// Builds a dictionary entry with pos "noun"; all remaining fields take their defaults.
fn entry(word: &str, gloss: &str) -> DictionaryEntry {
DictionaryEntry { word: word.into(), pos: "noun".into(), translation: gloss.into(), ..Default::default() }
}
// Builds a noun proposal with empty example, register and domain.
fn prop(form: &str, gloss: &str) -> LexProposal {
LexProposal {
form: form.into(),
gloss: gloss.into(),
pos: "noun".into(),
example: String::new(),
register: String::new(),
domain: Vec::new(),
}
}
// Exercises every `RejectReason` in one batch and checks that rejections keep input order.
#[test]
fn rejects_illegal_homophone_and_duplicate_meaning() {
let p = phon();
let existing = vec![entry("kara", "stone")];
// Expected outcome per proposal, in order:
//   tira/river -> kept
//   krta/gizmo -> Illegal (under the fixture's MaxClusterSize(1) constraint)
//   kara/rock  -> Homophone of the existing entry "kara"
//   tika/stone -> DuplicateMeaning of the existing translation "stone"
//   tira/lake  -> Homophone of the "tira" kept earlier in this batch
let proposals = vec![
prop("tira", "river"), prop("krta", "gizmo"), prop("kara", "rock"), prop("tika", "stone"), prop("tira", "lake"), ];
let (kept, rejected) = dedup(&p, &existing, proposals);
assert_eq!(kept, vec![prop("tira", "river")]);
let reasons: Vec<_> = rejected.iter().map(|(_, r)| *r).collect();
assert_eq!(
reasons,
vec![
RejectReason::Illegal,
RejectReason::Homophone,
RejectReason::DuplicateMeaning,
RejectReason::Homophone,
]
);
}
// The pool must be non-empty, free of repeated strings, and must not contain the
// existing word "ka".
#[test]
fn pool_excludes_existing_and_is_distinct() {
let p = phon();
let existing = vec![entry("ka", "x")];
let pool = build_pool(&p, &existing, 10);
assert!(!pool.is_empty());
let mut seen = HashSet::new();
for w in &pool {
assert!(seen.insert(w.clone()), "pool has a duplicate: {w}");
assert_ne!(w, "ka", "pool must exclude existing words");
}
}
// Proposals are rejected when too close to an existing vector or to a vector accepted
// earlier in the same call.
#[test]
fn semantic_filter_rejects_near_synonyms() {
let existing = vec![vec![1.0, 0.0, 0.0]]; let kept = vec![prop("a", "water"), prop("b", "stone"), prop("c", "aqua")];
// Vectors, in order: "a" is orthogonal to the existing vector (accepted); "b" is nearly
// parallel to the existing vector; "c" is nearly parallel to "a".
let kept_vecs = vec![
vec![0.0, 1.0, 0.0], vec![0.99, 0.01, 0.0], vec![0.0, 0.99, 0.01], ];
let (acc, rej) = semantic_filter(kept, &existing, &kept_vecs, 0.9);
assert_eq!(acc, vec![prop("a", "water")]);
assert_eq!(rej.len(), 2);
assert!(rej.iter().all(|(_, sim)| *sim > 0.9));
}
// Identical vectors score ~1, orthogonal vectors score ~0, and a zero vector yields exactly 0.
#[test]
fn cosine_basics() {
assert!((cosine(&[1.0, 0.0], &[1.0, 0.0]) - 1.0).abs() < 1e-6);
assert!(cosine(&[1.0, 0.0], &[0.0, 1.0]).abs() < 1e-6);
assert_eq!(cosine(&[0.0, 0.0], &[1.0, 1.0]), 0.0);
}
// The JSON object is recovered even when wrapped in prose and a fenced code block.
#[test]
fn parse_tolerates_fences_and_prose() {
let raw = "Here:\n```json\n{ \"entries\": [ { \"form\": \"kara\", \"gloss\": \"stone\", \
\"pos\": \"noun\" } ] }\n```\ndone";
let ps = parse_proposals(raw).unwrap();
assert_eq!(ps, vec![prop("kara", "stone")]);
}
}