use std::path::{Path, PathBuf};
use serde::{Deserialize, Serialize};
/// One stored translation: an English source text together with its conlang
/// rendering and an optional embedding vector.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Pair {
/// English source text, stored exactly as it was passed to `TranslationMemory::add`.
english: String,
/// Conlang translation recorded for `english`.
conlang: String,
/// Embedding vector compared by cosine similarity in `TranslationMemory::best`.
/// An empty vector means "no embedding set"; `#[serde(default)]` lets the
/// field be omitted from serialized data, in which case it deserializes as empty.
#[serde(default)]
embedding: Vec<f32>,
}
/// A collection of English → conlang translation pairs that can be queried
/// for exact, embedding-based, or token-overlap matches and serialized with serde.
///
/// `Default` yields an empty memory.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TranslationMemory {
/// Stored pairs, in insertion order (new pairs are pushed at the end).
pairs: Vec<Pair>,
}
/// Result of a lookup performed by `TranslationMemory::best`.
#[derive(Debug, Clone, PartialEq)]
pub enum MemoryHit {
/// A stored pair whose tokenized English equals the tokenized query.
Exact { conlang: String },
/// The best approximate match: `score` is either a cosine similarity
/// (embedding path) or a Jaccard score (token-overlap path), and `english`
/// is the stored source text that matched.
Fuzzy { conlang: String, score: f32, english: String },
/// No stored pair reached any of the match thresholds.
None,
}
/// Minimum Jaccard token-overlap score for a lexical fuzzy hit in `TranslationMemory::best`.
const FUZZY_THRESHOLD: f32 = 0.5;
/// Minimum cosine similarity for an embedding-based hit in `TranslationMemory::best`.
const SEMANTIC_THRESHOLD: f32 = 0.82;
/// Cosine similarity between two vectors.
///
/// Returns `0.0` when the vectors cannot be compared: either one is empty,
/// their lengths differ, or one of them has zero magnitude.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // Accumulate the dot product and both squared norms in a single pass,
    // in element order.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a.sqrt() * norm_b.sqrt())
}
/// Splits `s` into lowercase word tokens.
///
/// Every non-alphanumeric character acts as a separator, empty fragments are
/// dropped, and each remaining word is lowercased individually.
fn tokens(s: &str) -> Vec<String> {
    let mut words = Vec::new();
    for fragment in s.split(|ch: char| !ch.is_alphanumeric()) {
        if fragment.is_empty() {
            continue;
        }
        words.push(fragment.to_lowercase());
    }
    words
}
/// Jaccard similarity of two token lists, each treated as a *set* of tokens.
///
/// Returns `|A ∩ B| / |A ∪ B|`, a value in `[0.0, 1.0]`, or `0.0` when either
/// list is empty. Repeated tokens are deduplicated before comparing, so the
/// result is symmetric in its arguments and never exceeds `1.0`. Counting
/// list positions instead would let a repeated word such as "the" inflate
/// the intersection and push the score above `1.0`.
fn jaccard(a: &[String], b: &[String]) -> f32 {
    if a.is_empty() || b.is_empty() {
        return 0.0;
    }

    let left: std::collections::HashSet<&str> = a.iter().map(String::as_str).collect();
    let right: std::collections::HashSet<&str> = b.iter().map(String::as_str).collect();

    let shared = left.intersection(&right).count();
    // Both sets are non-empty here, so the union size is at least 1.
    let total = left.len() + right.len() - shared;
    shared as f32 / total as f32
}
impl TranslationMemory {
    /// Records `conlang` as the translation of `english`.
    ///
    /// If a stored pair already has the same token sequence as `english`
    /// (see `tokens`), that pair is overwritten and its embedding is cleared;
    /// otherwise a new pair without an embedding is appended.
    pub fn add(&mut self, english: &str, conlang: &str) {
        let key = tokens(english);
        let slot = self
            .pairs
            .iter()
            .position(|pair| tokens(&pair.english) == key);

        match slot {
            Some(index) => {
                let existing = &mut self.pairs[index];
                existing.conlang = conlang.to_string();
                existing.english = english.to_string();
                // The text changed, so any previously stored embedding is dropped.
                existing.embedding.clear();
            }
            None => self.pairs.push(Pair {
                english: english.to_string(),
                conlang: conlang.to_string(),
                embedding: Vec::new(),
            }),
        }
    }

    /// Returns the English text of every pair whose embedding is still empty.
    pub fn needs_embeddings(&self) -> Vec<String> {
        let mut pending = Vec::new();
        for pair in &self.pairs {
            if pair.embedding.is_empty() {
                pending.push(pair.english.clone());
            }
        }
        pending
    }

    /// Stores `embedding` on the first pair whose tokenized English equals the
    /// tokenized `english`; does nothing when no such pair exists.
    pub fn set_embedding(&mut self, english: &str, embedding: Vec<f32>) {
        let key = tokens(english);
        let target = self
            .pairs
            .iter_mut()
            .find(|pair| tokens(&pair.english) == key);
        if let Some(pair) = target {
            pair.embedding = embedding;
        }
    }

    /// Looks up the best stored translation for `english`.
    ///
    /// Matching proceeds in three stages, stopping at the first that succeeds:
    /// 1. a pair whose token sequence equals the query's yields `MemoryHit::Exact`;
    /// 2. if `query_embedding` is given, the pair with the highest cosine
    ///    similarity at or above `SEMANTIC_THRESHOLD` (pairs without an
    ///    embedding are skipped) yields `MemoryHit::Fuzzy`;
    /// 3. the pair with the highest Jaccard token score at or above
    ///    `FUZZY_THRESHOLD` yields `MemoryHit::Fuzzy`.
    ///
    /// Otherwise the result is `MemoryHit::None`. On equal scores the earliest
    /// stored pair wins.
    pub fn best(&self, english: &str, query_embedding: Option<&[f32]>) -> MemoryHit {
        let query = tokens(english);

        for pair in &self.pairs {
            if tokens(&pair.english) == query {
                return MemoryHit::Exact {
                    conlang: pair.conlang.clone(),
                };
            }
        }

        let semantic = query_embedding.and_then(|qv| {
            Self::top_scoring(
                self.pairs
                    .iter()
                    .filter(|pair| !pair.embedding.is_empty())
                    .map(|pair| (pair, cosine(qv, &pair.embedding))),
                SEMANTIC_THRESHOLD,
            )
        });

        // The lexical pass only runs when the semantic pass found nothing.
        let winner = semantic.or_else(|| {
            Self::top_scoring(
                self.pairs
                    .iter()
                    .map(|pair| (pair, jaccard(&query, &tokens(&pair.english)))),
                FUZZY_THRESHOLD,
            )
        });

        match winner {
            Some((pair, score)) => MemoryHit::Fuzzy {
                conlang: pair.conlang.clone(),
                score,
                english: pair.english.clone(),
            },
            None => MemoryHit::None,
        }
    }

    /// Picks the highest-scoring candidate whose score is at least `threshold`.
    /// A later candidate replaces the current leader only with a strictly
    /// greater score, so the first of several equal scores is kept.
    fn top_scoring<'a>(
        scored: impl Iterator<Item = (&'a Pair, f32)>,
        threshold: f32,
    ) -> Option<(&'a Pair, f32)> {
        scored
            .filter(|&(_, score)| score >= threshold)
            .fold(None, |lead, (pair, score)| match lead {
                Some((_, top)) if score <= top => lead,
                _ => Some((pair, score)),
            })
    }

    /// Number of stored pairs.
    pub fn len(&self) -> usize {
        self.pairs.len()
    }

    /// `true` when no pairs are stored.
    pub fn is_empty(&self) -> bool {
        self.pairs.is_empty()
    }

    /// Iterates over `(english, conlang)` for every stored pair, in storage order.
    pub fn entries(&self) -> impl Iterator<Item = (&str, &str)> {
        self.pairs.iter().map(|pair| {
            let Pair {
                english, conlang, ..
            } = pair;
            (english.as_str(), conlang.as_str())
        })
    }

    /// Path of the JSON file for `language` under `project_root`:
    /// `<project_root>/.inkhaven/translation-memory/<lowercased language>.json`.
    pub fn sidecar_path(project_root: &Path, language: &str) -> PathBuf {
        let file_name = format!("{}.json", language.to_lowercase());
        let mut path = project_root.join(".inkhaven");
        path.push("translation-memory");
        path.push(file_name);
        path
    }

    /// Loads the memory for `language` from its sidecar file.
    ///
    /// A missing file is not an error and yields an empty memory.
    ///
    /// # Errors
    /// Returns any other I/O error from reading the file, or an
    /// `InvalidData` error wrapping the JSON parse failure.
    pub fn load(project_root: &Path, language: &str) -> std::io::Result<Self> {
        let path = Self::sidecar_path(project_root, language);
        let raw = match std::fs::read_to_string(&path) {
            Ok(raw) => raw,
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
                return Ok(Self::default());
            }
            Err(err) => return Err(err),
        };
        serde_json::from_str(&raw)
            .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))
    }

    /// Serializes the memory as pretty-printed JSON and writes it to the
    /// sidecar file for `language`, creating parent directories as needed.
    ///
    /// The write itself is delegated to `crate::io_atomic::write`
    /// (presumably an atomic replace, judging by the module name — confirm there).
    ///
    /// # Errors
    /// Returns directory-creation or write errors, or an `InvalidData` error
    /// wrapping a serialization failure.
    pub fn save(&self, project_root: &Path, language: &str) -> std::io::Result<()> {
        let path = Self::sidecar_path(project_root, language);
        if let Some(dir) = path.parent() {
            std::fs::create_dir_all(dir)?;
        }
        let body = match serde_json::to_vec_pretty(self) {
            Ok(bytes) => bytes,
            Err(err) => {
                return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, err));
            }
        };
        crate::io_atomic::write(&path, &body)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a memory by adding the given `(english, conlang)` pairs in order.
    fn memory_with(entries: &[(&str, &str)]) -> TranslationMemory {
        let mut memory = TranslationMemory::default();
        for (english, conlang) in entries {
            memory.add(english, conlang);
        }
        memory
    }

    /// Case and punctuation differences still count as an exact match.
    #[test]
    fn exact_match_is_normalized() {
        let memory = memory_with(&[("The bird sees the stone.", "kira nami pata")]);
        let hit = memory.best("the bird sees the stone", None);
        assert_eq!(
            hit,
            MemoryHit::Exact {
                conlang: "kira nami pata".into()
            }
        );
    }

    /// Adding the same English text twice replaces the earlier translation.
    #[test]
    fn re_adding_supersedes() {
        let memory = memory_with(&[("the bird flies", "kira aaa"), ("the bird flies", "kira bbb")]);
        assert_eq!(memory.len(), 1);
        let hit = memory.best("the bird flies", None);
        assert_eq!(
            hit,
            MemoryHit::Exact {
                conlang: "kira bbb".into()
            }
        );
    }

    /// A one-word difference produces a fuzzy hit with a score below 1.0.
    #[test]
    fn near_match_is_fuzzy() {
        let memory = memory_with(&[("the bird sees the stone", "kira nami pata")]);
        match memory.best("the bird sees a stone", None) {
            MemoryHit::Fuzzy { conlang, score, .. } => {
                assert_eq!(conlang, "kira nami pata");
                assert!(score >= 0.5 && score < 1.0);
            }
            other => panic!("expected fuzzy, got {other:?}"),
        }
    }

    /// With no lexical overlap, an embedding close to a stored one still matches.
    #[test]
    fn semantic_match_uses_cosine_when_lexical_fails() {
        let mut memory = memory_with(&[
            ("the warrior raises his sword", "AAA"),
            ("the bird sees the stone", "BBB"),
        ]);
        memory.set_embedding("the warrior raises his sword", vec![1.0, 0.0, 0.0]);
        memory.set_embedding("the bird sees the stone", vec![0.0, 1.0, 0.0]);

        let query_vec = [0.96, 0.1, 0.0];
        match memory.best("a soldier lifts a blade", Some(&query_vec)) {
            MemoryHit::Fuzzy { conlang, score, .. } => {
                assert_eq!(conlang, "AAA");
                assert!(score >= 0.82);
            }
            other => panic!("expected a semantic match, got {other:?}"),
        }

        // Without an embedding the same query has nothing to match on.
        let lexical_only = memory.best("a soldier lifts a blade", None);
        assert_eq!(lexical_only, MemoryHit::None);
    }

    /// Text sharing almost no tokens with the memory is reported as a miss.
    #[test]
    fn unrelated_is_a_miss() {
        let memory = memory_with(&[("the bird sees the stone", "kira nami pata")]);
        let hit = memory.best("a dragon burns the tower", None);
        assert_eq!(hit, MemoryHit::None);
    }
}