use super::http::{HttpClient, HttpError};
use super::meaning::MeaningId;
use super::wikidata::Wikidata;
use super::wiktionary::{Wiktionary, WiktionaryCandidate};
use crate::seed::{
ROLE_COMPOSITIONAL_GENITIVE_HEAD, ROLE_COMPOSITIONAL_LEMMA, ROLE_COMPOSITIONAL_PHRASE,
};
/// Reports whether translation debug tracing is switched on.
///
/// Tracing is on when the `FORMAL_AI_TRANSLATION_DEBUG` environment variable
/// holds anything other than the empty string or `"0"`. A variable that
/// `std::env::var` cannot return (unset, or not valid Unicode) counts as off.
fn translation_debug_enabled() -> bool {
    match std::env::var("FORMAL_AI_TRANSLATION_DEBUG") {
        Ok(flag) => !(flag.is_empty() || flag == "0"),
        Err(_) => false,
    }
}
/// Writes one diagnostic line to stderr when debug tracing is enabled.
///
/// `stage` names the pipeline step and `message` is the free-form detail; the
/// line is emitted as `[formal-ai translation] {stage}: {message}`. Nothing is
/// printed when tracing is disabled.
fn translation_debug(stage: &str, message: &str) {
    if !translation_debug_enabled() {
        return;
    }
    eprintln!("[formal-ai translation] {stage}: {message}");
}
/// Result of translating one surface form from a source language into a
/// target language.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Translation {
/// The surface text exactly as it was handed to the pipeline.
pub source_surface: String,
/// Language code of the input surface.
pub source_lang: String,
/// Language code the candidates are expressed in.
pub target_lang: String,
/// Identifier of the meaning the pipeline settled on for this surface.
pub meaning: MeaningId,
/// Candidate translations in the order the pipeline produced them; may be
/// empty when no lookup or fallback yielded anything.
pub candidates: Vec<WiktionaryCandidate>,
/// Trace entries appended in order, describing the lookups that were
/// performed (including ones that failed).
pub provenance: Vec<String>,
}
impl Translation {
    /// Returns the surface of the preferred candidate, if any.
    ///
    /// The first candidate without a qualifier wins; when every candidate
    /// carries a qualifier the very first candidate is used instead. Returns
    /// `None` only when there are no candidates at all.
    #[must_use]
    pub fn primary_surface(&self) -> Option<&str> {
        let unqualified = self
            .candidates
            .iter()
            .find(|candidate| candidate.qualifier.is_none());
        let chosen = match unqualified {
            Some(candidate) => candidate,
            None => self.candidates.first()?,
        };
        Some(chosen.surface.as_str())
    }
}
/// Translation pipeline that performs all of its remote lookups through a
/// borrowed [`HttpClient`].
pub struct TranslationPipeline<'a, T: HttpClient + ?Sized> {
/// HTTP client handed to every Wiktionary / Wikidata helper the pipeline builds.
http: &'a T,
}
impl<'a, T: HttpClient + ?Sized> TranslationPipeline<'a, T> {
/// Creates a pipeline that issues its requests through `http`.
pub const fn new(http: &'a T) -> Self {
Self { http }
}
/// Translates `surface` from `source_lang` into `target_lang`.
///
/// The stages run in a fixed order, each one only when the previous ones
/// produced no translation blocks:
///
/// 1. identity shortcut when both languages are equal;
/// 2. seed-lexicon compositional hit;
/// 3. translation blocks from the source-language Wiktionary page, plus its
///    `/translations` subpage when needed;
/// 4. reverse lookup;
/// 5. phrasal variants of the page title (direct, then reverse lookup);
/// 6. block selection, Wikidata enrichment, and finally the compositional
///    fallback when no candidate survived.
///
/// Every lookup (successful or failed) appends an entry to the returned
/// `provenance` list.
///
/// # Errors
///
/// The signature allows an [`HttpError`], but the body below never returns
/// `Err`: lookup failures are recorded in `provenance` (or, for the wikitext
/// fetch, discarded) and the pipeline moves on to the next stage.
pub fn translate(
&self,
surface: &str,
source_lang: &str,
target_lang: &str,
) -> Result<Translation, HttpError> {
translation_debug(
"translate",
&format!("start surface={surface:?} {source_lang}->{target_lang}"),
);
let mut provenance: Vec<String> = Vec::new();
let page_title = normalize_page_title(surface);
translation_debug("translate", &format!("page_title={page_title:?}"));
// Identity shortcut: the surface is its own translation. Wikidata is still
// consulted so the meaning can be upgraded (and extra lemmas appended).
if source_lang == target_lang {
translation_debug("translate", "identity (source==target)");
let mut candidates = vec![WiktionaryCandidate {
surface: surface.to_owned(),
qualifier: None,
}];
provenance.push("identity".to_owned());
let meaning = upgrade_meaning_via_wikidata(
self.http,
&page_title,
source_lang,
target_lang,
&mut provenance,
&mut candidates,
)
// No Wikidata meaning: fall back to an id derived from the page title.
.unwrap_or_else(|| MeaningId::from_wiktionary_page(source_lang, &page_title));
return Ok(Translation {
source_surface: surface.to_owned(),
source_lang: source_lang.to_owned(),
target_lang: target_lang.to_owned(),
meaning,
candidates,
provenance,
});
}
// Seed lexicon hit: answered locally, before any Wiktionary request is made.
if let Some((slug, target_surface)) =
seed_compositional_translation(&page_title, source_lang, target_lang)
{
translation_debug("translate", "seed compositional hit");
provenance.push(format!(
"compositional:{source_lang}->{target_lang}:{page_title}"
));
return Ok(Translation {
source_surface: surface.to_owned(),
source_lang: source_lang.to_owned(),
target_lang: target_lang.to_owned(),
meaning: seed_meaning_id(slug),
candidates: vec![WiktionaryCandidate {
surface: target_surface.to_owned(),
qualifier: None,
}],
provenance,
});
}
// Stage 1: translation blocks from the source-language edition's page.
let source_wiktionary = Wiktionary::new(source_lang, self.http);
let mut blocks: Vec<Vec<WiktionaryCandidate>> = Vec::new();
// Default meaning; replaced later if a variant page or Wikidata supplies a better one.
let mut meaning = MeaningId::from_wiktionary_page(source_lang, &page_title);
match source_wiktionary.translation_blocks(&page_title, target_lang) {
Ok(found) => {
provenance.push(format!(
"wiktionary:{source_lang}:{page_title}#translations->{target_lang}"
));
translation_debug("stage1", &format!("source-edition blocks={}", found.len()));
if !found.is_empty() {
blocks = found;
}
}
Err(error) => {
provenance.push(format!(
"wiktionary:{source_lang}:{page_title}#translations->error({error})",
));
translation_debug("stage1", &format!("source-edition error: {error}"));
}
}
// The main page may delegate its translations to a `/translations` subpage.
// A failed wikitext fetch is silently treated as "does not delegate".
let main_wikitext = source_wiktionary.wikitext(&page_title).ok();
let main_delegates_subpage = main_wikitext
.as_deref()
.is_some_and(|wt| wt.contains("{{see translation subpage|"));
if blocks.is_empty() || main_delegates_subpage {
let subpage = format!("{page_title}/translations");
match source_wiktionary.translation_blocks(&subpage, target_lang) {
Ok(found) if !found.is_empty() => {
provenance.push(format!(
"wiktionary:{source_lang}:{subpage}#translations->{target_lang}"
));
// Subpage blocks are placed ahead of whatever the main page yielded.
let mut merged = found;
merged.extend(std::mem::take(&mut blocks));
blocks = merged;
}
Ok(_) => {}
Err(error) => {
provenance.push(format!(
"wiktionary:{source_lang}:{subpage}#translations->error({error})",
));
}
}
}
// Stage 2: reverse lookup, only when nothing was found so far.
if blocks.is_empty() {
if let Some(reverse) = reverse_lookup(
self.http,
surface,
source_lang,
target_lang,
&mut provenance,
) {
blocks = reverse;
}
}
// Title that later stages work with; switches to a variant if one succeeds.
let mut active_page_title = page_title.clone();
// Stage 3: retry with phrasal variants of the title. For each variant the
// direct lookup is tried first, then the reverse lookup; the first variant
// that yields blocks wins and also replaces the meaning.
if blocks.is_empty() {
for variant in phrasal_variants(&page_title, source_lang) {
provenance.push(format!("wiktionary:{source_lang}:variant->{variant}"));
match source_wiktionary.translation_blocks(&variant, target_lang) {
Ok(found) if !found.is_empty() => {
provenance.push(format!(
"wiktionary:{source_lang}:{variant}#translations->{target_lang}"
));
blocks = found;
active_page_title.clone_from(&variant);
meaning = MeaningId::from_wiktionary_page(source_lang, &variant);
break;
}
Ok(_) => {}
Err(error) => {
provenance.push(format!(
"wiktionary:{source_lang}:{variant}#translations->error({error})"
));
}
}
if let Some(reverse) = reverse_lookup(
self.http,
&variant,
source_lang,
target_lang,
&mut provenance,
) {
blocks = reverse;
active_page_title.clone_from(&variant);
meaning = MeaningId::from_wiktionary_page(source_lang, &variant);
break;
}
}
}
// Reduce the collected blocks to a single ordered candidate list.
let candidates = if blocks.is_empty() {
Vec::new()
} else {
select_best_block(
self.http,
&active_page_title,
source_lang,
target_lang,
&mut provenance,
blocks,
)
};
let mut candidates = candidates;
// Wikidata may replace the meaning and may append further candidates.
if let Some(updated) = upgrade_meaning_via_wikidata(
self.http,
&active_page_title,
source_lang,
target_lang,
&mut provenance,
&mut candidates,
) {
meaning = updated;
}
// Last resort: compositional fallback (it may also overwrite `meaning`).
if candidates.is_empty() {
candidates = compositional_candidates(
&active_page_title,
source_lang,
target_lang,
&mut meaning,
&mut provenance,
);
}
translation_debug(
"translate",
&format!(
"done candidates={} primary={:?} meaning={:?}",
candidates.len(),
// Same selection rule as `Translation::primary_surface`.
candidates
.iter()
.find(|c| c.qualifier.is_none())
.or_else(|| candidates.first())
.map(|c| c.surface.as_str()),
meaning,
),
);
Ok(Translation {
source_surface: surface.to_owned(),
source_lang: source_lang.to_owned(),
target_lang: target_lang.to_owned(),
meaning,
candidates,
provenance,
})
}
}
/// Looks for translation blocks for `surface` in the source-language edition
/// and then in the target-language edition.
///
/// The surface is normalized into a page title first. The first edition that
/// returns a non-empty set of blocks wins and is recorded in `provenance` with
/// a `#reverse->` entry; lookup errors are recorded as well and do not stop
/// the search. Returns `None` when neither edition yields any block.
fn reverse_lookup<T: HttpClient + ?Sized>(
    http: &T,
    surface: &str,
    source_lang: &str,
    target_lang: &str,
    provenance: &mut Vec<String>,
) -> Option<Vec<Vec<WiktionaryCandidate>>> {
    let page_title = normalize_page_title(surface);
    [source_lang, target_lang].iter().find_map(|&edition| {
        let wiktionary = Wiktionary::new(edition, http);
        match wiktionary.translation_blocks(&page_title, target_lang) {
            Ok(blocks) => {
                if blocks.is_empty() {
                    // Empty answers leave no trace; move on to the next edition.
                    return None;
                }
                provenance.push(format!(
                    "wiktionary:{edition}:{page_title}#reverse->{target_lang}"
                ));
                Some(blocks)
            }
            Err(error) => {
                provenance.push(format!(
                    "wiktionary:{edition}:{page_title}#reverse->error({error})"
                ));
                None
            }
        }
    })
}
/// Picks the translation block that is best confirmed by back-translation and
/// returns its candidates, confirmed ones first.
///
/// For every candidate in every block, the target-language edition is asked
/// for translations of the candidate back into `source_lang`. A candidate is
/// "confirmed" when one of those back-translation blocks contains a row whose
/// normalized surface equals `page_title`; the row's index inside the first
/// such block is remembered and logged to `provenance`.
///
/// The block with the most confirmed candidates wins (the earliest block on a
/// tie, and block 0 when nothing is confirmed). Within the winning block,
/// confirmed candidates come first, ordered by their confirmation index, and
/// unconfirmed candidates follow in their original order.
fn select_best_block<T: HttpClient + ?Sized>(
    http: &T,
    page_title: &str,
    source_lang: &str,
    target_lang: &str,
    provenance: &mut Vec<String>,
    blocks: Vec<Vec<WiktionaryCandidate>>,
) -> Vec<WiktionaryCandidate> {
    let target_wiktionary = Wiktionary::new(target_lang, http);

    // confirmations[b][c] = index at which candidate c of block b was confirmed.
    let mut confirmations: Vec<Vec<Option<usize>>> = Vec::with_capacity(blocks.len());
    for (block_idx, block) in blocks.iter().enumerate() {
        let mut slots: Vec<Option<usize>> = Vec::with_capacity(block.len());
        for candidate in block {
            let candidate_page = normalize_page_title(&candidate.surface);
            let confirmed_at = if candidate_page.is_empty() {
                None
            } else {
                match target_wiktionary.translation_blocks(&candidate_page, source_lang) {
                    // Only the first back-translation block containing the title counts.
                    Ok(back_blocks) => back_blocks.iter().find_map(|back_block| {
                        back_block
                            .iter()
                            .position(|row| normalize_page_title(&row.surface) == page_title)
                    }),
                    // A failed back-lookup simply leaves the candidate unconfirmed.
                    Err(_) => None,
                }
            };
            if let Some(pos) = confirmed_at {
                provenance.push(format!(
                    "wiktionary:{target_lang}:{candidate_page}#confirms->{source_lang}:{page_title}@{pos}[block{block_idx}]"
                ));
            }
            slots.push(confirmed_at);
        }
        confirmations.push(slots);
    }

    // Strictly-greater comparison keeps the earliest block on ties.
    let (best_block, _) = confirmations.iter().enumerate().fold(
        (0usize, 0usize),
        |(best_idx, best_count), (idx, slots)| {
            let count = slots.iter().flatten().count();
            if count > best_count {
                (idx, count)
            } else {
                (best_idx, best_count)
            }
        },
    );

    let chosen = blocks.into_iter().nth(best_block).unwrap_or_default();
    let chosen_slots = confirmations
        .into_iter()
        .nth(best_block)
        .unwrap_or_default();

    let mut ranked: Vec<(usize, (WiktionaryCandidate, Option<usize>))> =
        chosen.into_iter().zip(chosen_slots).enumerate().collect();
    ranked.sort_by_key(|(idx, (_, pos))| match pos {
        Some(p) => (0usize, *p, *idx),
        None => (1usize, 0, *idx),
    });
    ranked.into_iter().map(|(_, (cand, _))| cand).collect()
}
/// Tries to replace a page-derived meaning with a Wikidata lexeme id and to
/// enrich `candidates` with lemmas known to Wikidata.
///
/// Steps, each leaving a trace in `provenance`:
///
/// 1. search a lexeme for `page_title` in `source_lang`; a search error or an
///    empty result yields `None`;
/// 2. fetch the first hit's translations into `target_lang` and append every
///    lemma whose surface is not already among `candidates` (a failure here is
///    logged but not fatal);
/// 3. let `canonical_target_english_meaning` override the meaning when it
///    produces one.
///
/// Returns the canonical meaning when available, otherwise the meaning built
/// from the first lexeme hit.
fn upgrade_meaning_via_wikidata<T: HttpClient + ?Sized>(
    http: &T,
    page_title: &str,
    source_lang: &str,
    target_lang: &str,
    provenance: &mut Vec<String>,
    candidates: &mut Vec<WiktionaryCandidate>,
) -> Option<MeaningId> {
    let wikidata = Wikidata::new(http);
    let hits = match wikidata.search_lexeme(page_title, source_lang) {
        Ok(found) => found,
        Err(error) => {
            provenance.push(format!("wikidata:search->error({error})"));
            return None;
        }
    };
    let lexeme = hits.first()?;
    provenance.push(format!("wikidata:lexeme:{}", lexeme.id));
    let lexeme_meaning = MeaningId::from_sense(lexeme.id.clone());

    match wikidata.lexeme_translations(&lexeme.id, target_lang) {
        Ok(lemmas) => {
            if !lemmas.is_empty() {
                provenance.push(format!(
                    "wikidata:sparql:{}->{} ({} lemmas)",
                    lexeme.id,
                    target_lang,
                    lemmas.len()
                ));
            }
            for lemma in lemmas {
                let already_known = candidates
                    .iter()
                    .any(|known| known.surface == lemma.value);
                if !already_known {
                    candidates.push(WiktionaryCandidate {
                        surface: lemma.value,
                        qualifier: None,
                    });
                }
            }
        }
        Err(error) => {
            provenance.push(format!("wikidata:sparql->error({error})"));
        }
    }

    // Both the success and the failure path finish with the same canonical check.
    let canonical = canonical_target_english_meaning(
        &wikidata,
        source_lang,
        target_lang,
        candidates,
        provenance,
    );
    Some(canonical.unwrap_or(lexeme_meaning))
}
/// Derives a meaning from the English lexeme of the preferred candidate.
///
/// Only applies when translating *into* English from a non-English source;
/// every other language pair returns `None` immediately. The preferred
/// candidate is the first one without a qualifier (or the first candidate at
/// all); its normalized surface is searched as an English lexeme and the first
/// hit becomes the meaning. Search hits and search errors are recorded in
/// `provenance`.
fn canonical_target_english_meaning<T: HttpClient + ?Sized>(
    wikidata: &Wikidata<'_, T>,
    source_lang: &str,
    target_lang: &str,
    candidates: &[WiktionaryCandidate],
    provenance: &mut Vec<String>,
) -> Option<MeaningId> {
    let into_english_only =
        !source_lang.eq_ignore_ascii_case("en") && target_lang.eq_ignore_ascii_case("en");
    if !into_english_only {
        return None;
    }

    let preferred = match candidates.iter().find(|entry| entry.qualifier.is_none()) {
        Some(entry) => entry,
        None => candidates.first()?,
    };
    let lemma = normalize_page_title(&preferred.surface);
    if lemma.is_empty() {
        return None;
    }

    let hits = match wikidata.search_lexeme(&lemma, "en") {
        Ok(found) => found,
        Err(error) => {
            provenance.push(format!("wikidata:canonical_search->error({error})"));
            return None;
        }
    };
    let top = hits.first()?;
    provenance.push(format!("wikidata:canonical_lexeme:{}", top.id));
    Some(MeaningId::from_sense(top.id.clone()))
}
/// Offline fallback that builds a single candidate from the seed lexicon.
///
/// Attempts, in order, returning at the first success:
///
/// 1. a seed compositional translation of `page_title` (also sets `meaning`);
/// 2. for Russian to English only, a whole-phrase seed translation;
/// 3. a seed compositional translation of any phrasal variant of the title
///    (also sets `meaning`);
/// 4. for Russian to English only, a word-by-word translation of titles made
///    of 2 to 8 whitespace-separated words.
///
/// Each success appends a `compositional:` entry to `provenance`. Returns an
/// empty vector when nothing applies.
fn compositional_candidates(
    page_title: &str,
    source_lang: &str,
    target_lang: &str,
    meaning: &mut MeaningId,
    provenance: &mut Vec<String>,
) -> Vec<WiktionaryCandidate> {
    // Every successful path yields exactly one unqualified candidate.
    let single = |surface: &str| {
        vec![WiktionaryCandidate {
            surface: surface.to_owned(),
            qualifier: None,
        }]
    };
    let russian_to_english =
        source_lang.eq_ignore_ascii_case("ru") && target_lang.eq_ignore_ascii_case("en");

    if let Some((slug, surface)) =
        seed_compositional_translation(page_title, source_lang, target_lang)
    {
        provenance.push(format!(
            "compositional:{source_lang}->{target_lang}:{page_title}"
        ));
        *meaning = seed_meaning_id(slug);
        return single(surface);
    }

    if russian_to_english {
        if let Some(surface) = russian_phrase_to_english(page_title) {
            provenance.push(format!("compositional:ru->en:{page_title}"));
            return single(surface);
        }
    }

    for variant in phrasal_variants(page_title, source_lang) {
        let Some((slug, surface)) =
            seed_compositional_translation(&variant, source_lang, target_lang)
        else {
            continue;
        };
        provenance.push(format!(
            "compositional:{source_lang}->{target_lang}:{page_title}=>variant:{variant}"
        ));
        *meaning = seed_meaning_id(slug);
        return single(surface);
    }

    if !russian_to_english {
        return Vec::new();
    }
    let words: Vec<&str> = page_title.split_whitespace().collect();
    if words.len() < 2 || words.len() > 8 {
        return Vec::new();
    }
    match translate_russian_word_sequence(&words) {
        Some(surface) => {
            provenance.push(format!("compositional:ru->en:{page_title}"));
            vec![WiktionaryCandidate {
                surface,
                qualifier: None,
            }]
        }
        None => Vec::new(),
    }
}
/// Builds the meaning id for a seed-lexicon entry by treating `slug` as a
/// page in the pseudo-language `"seed"`.
fn seed_meaning_id(slug: &str) -> MeaningId {
    const SEED_NAMESPACE: &str = "seed";
    MeaningId::from_wiktionary_page(SEED_NAMESPACE, slug)
}
/// Returns the seed meaning whose compositional entry matches `surface` in
/// `language`, if there is one.
///
/// The surface is normalized into a page title and looked up with `language`
/// as both the source and the target language.
pub(crate) fn seed_meaning_for_surface(surface: &str, language: &str) -> Option<MeaningId> {
    let page_title = normalize_page_title(surface);
    let (slug, _) = seed_compositional_translation(&page_title, language, language)?;
    Some(seed_meaning_id(slug))
}
/// Looks `page_title` up in the seed lexicon, preferring phrase entries over
/// lemma entries.
///
/// Returns the matching meaning's slug together with its surface in
/// `target_lang`, or `None` when neither role yields a translation.
fn seed_compositional_translation(
    page_title: &str,
    source_lang: &str,
    target_lang: &str,
) -> Option<(&'static str, &'static str)> {
    if let Some(phrase_hit) = seed_role_translation(
        ROLE_COMPOSITIONAL_PHRASE,
        source_lang,
        target_lang,
        page_title,
    ) {
        return Some(phrase_hit);
    }
    seed_role_translation(
        ROLE_COMPOSITIONAL_LEMMA,
        source_lang,
        target_lang,
        page_title,
    )
}
/// Finds the first seed meaning with `role` that lists `surface` among its
/// `source_lang` words, and returns its slug plus its `target_lang` word.
///
/// Only the first matching meaning is considered: if it has no word in
/// `target_lang` the result is `None`, even if a later meaning would match.
fn seed_role_translation(
    role: &str,
    source_lang: &str,
    target_lang: &str,
    surface: &str,
) -> Option<(&'static str, &'static str)> {
    let hit = crate::seed::lexicon().meanings.iter().find(|meaning| {
        meaning.has_role(role)
            && meaning.lexemes.iter().any(|lexeme| {
                lexeme.language.eq_ignore_ascii_case(source_lang)
                    && lexeme
                        .words
                        .iter()
                        .any(|word| same_surface(&word.text, surface))
            })
    })?;
    let target = hit.word_in(target_lang)?;
    Some((hit.slug.as_str(), target))
}
/// Compares two surfaces case-insensitively.
///
/// Case folding is done per character with Unicode lowercase mappings, so
/// non-ASCII scripts such as Cyrillic match regardless of case ("Привет" vs
/// "привет"). ASCII-only folding would miss those, and page titles only have
/// their first character lowercased before they reach this comparison.
fn same_surface(left: &str, right: &str) -> bool {
    // Fast path for identical strings; otherwise compare the lowercase streams.
    left == right
        || left
            .chars()
            .flat_map(char::to_lowercase)
            .eq(right.chars().flat_map(char::to_lowercase))
}
/// Looks up a whole Russian phrase among the seed lexicon's compositional
/// phrase entries and returns its English surface, if any.
fn russian_phrase_to_english(page_title: &str) -> Option<&'static str> {
    let lexicon = crate::seed::lexicon();
    lexicon.role_surface_translation(ROLE_COMPOSITIONAL_PHRASE, "ru", "en", page_title)
}
/// Looks up a single Russian word among the seed lexicon's compositional
/// lemma entries and returns its English surface, if any.
fn russian_word_to_english(word: &str) -> Option<&'static str> {
    let lexicon = crate::seed::lexicon();
    lexicon.role_surface_translation(ROLE_COMPOSITIONAL_LEMMA, "ru", "en", word)
}
/// Translates a sequence of Russian words into an English phrase, word by word.
///
/// A word that is a genitive relation head and is followed by a word with a
/// known genitive translation is rendered as `<head> of <noun>`, consuming
/// both words. Every other word is translated on its own. Returns `None` as
/// soon as any word has no translation. The first character of the joined
/// result is upper-cased.
fn translate_russian_word_sequence(words: &[&str]) -> Option<String> {
    let mut english: Vec<&str> = Vec::with_capacity(words.len() + 2);
    let mut remaining = words;
    while let Some((&word, tail)) = remaining.split_first() {
        // A genitive pairing needs both a head word and a translatable follower.
        let genitive = match tail.first() {
            Some(next) if russian_genitive_relation_head(word) => russian_genitive_noun(next),
            _ => None,
        };
        english.push(russian_word_to_english(word)?);
        remaining = match genitive {
            Some(noun) => {
                english.push("of");
                english.push(noun);
                &tail[1..]
            }
            None => tail,
        };
    }
    Some(capitalize_ascii_first(&english.join(" ")))
}
/// Tells whether `word` is listed in the seed lexicon as a Russian surface of
/// the compositional genitive-head role.
fn russian_genitive_relation_head(word: &str) -> bool {
    let lexicon = crate::seed::lexicon();
    lexicon.role_lists_surface(ROLE_COMPOSITIONAL_GENITIVE_HEAD, "ru", word)
}
/// Returns the English surface for a Russian word registered under the
/// `"genitive"` action of the compositional lemma role, if any.
fn russian_genitive_noun(word: &str) -> Option<&'static str> {
    let lexicon = crate::seed::lexicon();
    lexicon.role_action_surface_translation(ROLE_COMPOSITIONAL_LEMMA, "genitive", "ru", "en", word)
}
/// Upper-cases the first character of `surface` and leaves the rest untouched.
///
/// Despite the name, the Unicode uppercase mapping is used, so the first
/// character may expand into several (for example `ß` becomes `SS`). An empty
/// input yields an empty string.
fn capitalize_ascii_first(surface: &str) -> String {
    let mut rest = surface.chars();
    match rest.next() {
        None => String::new(),
        Some(initial) => initial.to_uppercase().chain(rest).collect(),
    }
}
/// Produces alternative page titles by dropping a Russian possessive phrase
/// ("у тебя", "у вас", ...) from the middle of `page_title`.
///
/// Only Russian (`source_lang` equal to `"ru"`, ignoring ASCII case) yields
/// variants. A phrase is removed only when it is surrounded by single spaces,
/// and only its first occurrence is considered. Each variant has its
/// whitespace collapsed; empty, unchanged and duplicate variants are skipped.
/// Variants are returned in the order of the possessive table.
#[must_use]
pub fn phrasal_variants(page_title: &str, source_lang: &str) -> Vec<String> {
    const RUSSIAN_POSSESSIVES: [&str; 8] = [
        "у тебя",
        "у вас",
        "у нас",
        "у меня",
        "у них",
        "у него",
        "у неё",
        "у нее",
    ];

    let mut variants: Vec<String> = Vec::new();
    if !source_lang.eq_ignore_ascii_case("ru") {
        return variants;
    }
    for possessive in RUSSIAN_POSSESSIVES.iter() {
        let needle = format!(" {possessive} ");
        let Some((before, after)) = page_title.split_once(needle.as_str()) else {
            continue;
        };
        // Re-join the two halves with single spaces between all words.
        let alt = before
            .split_whitespace()
            .chain(after.split_whitespace())
            .collect::<Vec<_>>()
            .join(" ");
        let is_new = !alt.is_empty() && alt != page_title && !variants.contains(&alt);
        if is_new {
            variants.push(alt);
        }
    }
    variants
}
/// Turns a surface form into a Wiktionary-style page title.
///
/// Surrounding whitespace is removed, trailing sentence punctuation is
/// stripped (ASCII `?`, `!`, `.`, the ideographic full stop `。` and the
/// fullwidth `？`, `！`, `．`), and the first character is lower-cased using its
/// Unicode mapping. The rest of the text is left untouched. Input that is
/// empty or consists only of whitespace and punctuation yields an empty
/// string.
#[must_use]
pub fn normalize_page_title(surface: &str) -> String {
    // Written as escapes so the fullwidth forms cannot be silently folded
    // into their ASCII look-alikes by an editor or a normalizing tool.
    const SENTENCE_ENDINGS: [char; 7] = [
        '?', '!', '.', '\u{3002}', '\u{ff1f}', '\u{ff01}', '\u{ff0e}',
    ];
    // Whitespace and punctuation are stripped together so that inputs such as
    // "Ça va ?" do not keep a dangling space once the "?" is removed.
    let trimmed = surface
        .trim()
        .trim_end_matches(|c: char| c.is_whitespace() || SENTENCE_ENDINGS.contains(&c));
    let mut chars = trimmed.chars();
    let Some(first) = chars.next() else {
        return String::new();
    };
    let mut out = String::with_capacity(trimmed.len());
    // `to_lowercase` may expand to more than one character.
    out.extend(first.to_lowercase());
    out.extend(chars);
    out
}