use super::http::{HttpClient, HttpError};
use super::meaning::MeaningId;
use super::wikidata::Wikidata;
use super::wiktionary::{Wiktionary, WiktionaryCandidate};
/// Result of translating one surface form from a source language into a
/// target language, together with a record of how the result was obtained.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Translation {
/// The surface form exactly as the caller passed it to `translate`.
pub source_surface: String,
/// Language code the surface form was given in.
pub source_lang: String,
/// Language code the candidates are expressed in.
pub target_lang: String,
/// Identifier of the meaning the lookup settled on.
pub meaning: MeaningId,
/// Candidate translations in the order produced by the pipeline; may be
/// empty when no lookup produced anything.
pub candidates: Vec<WiktionaryCandidate>,
/// Human-readable trail of the lookups (and lookup errors) that were
/// performed while building this translation, in the order they happened.
pub provenance: Vec<String>,
}
impl Translation {
    /// Returns the surface of the preferred candidate, if there is one.
    ///
    /// The first candidate without a qualifier wins; when every candidate is
    /// qualified the very first candidate is used instead. `None` is returned
    /// only when there are no candidates at all.
    #[must_use]
    pub fn primary_surface(&self) -> Option<&str> {
        let unqualified = self
            .candidates
            .iter()
            .find(|candidate| candidate.qualifier.is_none());
        let chosen = match unqualified {
            Some(candidate) => candidate,
            None => self.candidates.first()?,
        };
        Some(chosen.surface.as_str())
    }
}
/// Translation lookup driver that performs all of its requests through a
/// borrowed [`HttpClient`].
pub struct TranslationPipeline<'a, T: HttpClient + ?Sized> {
/// HTTP client handed to every Wiktionary / Wikidata helper this pipeline builds.
http: &'a T,
}
impl<'a, T: HttpClient + ?Sized> TranslationPipeline<'a, T> {
    /// Builds a pipeline that sends every request through `http`.
    pub const fn new(http: &'a T) -> Self {
        Self { http }
    }

    /// Translates `surface` from `source_lang` into `target_lang`.
    ///
    /// When both languages are equal the surface is echoed back as the only
    /// candidate with the provenance entry `"identity"`. Otherwise the
    /// following lookups are tried in order, stopping at the first one that
    /// yields translation blocks:
    ///
    /// 1. the normalized page on the source-language Wiktionary,
    /// 2. its `/translations` subpage,
    /// 3. [`reverse_lookup`] for the original surface,
    /// 4. every [`phrasal_variants`] title (direct lookup, then
    ///    [`reverse_lookup`]).
    ///
    /// Any blocks found are ranked by [`select_best_block`], and
    /// [`upgrade_meaning_via_wikidata`] is then always given the chance to
    /// replace the meaning and append further candidates. Each step and each
    /// lookup error is appended to the returned provenance list.
    ///
    /// # Errors
    ///
    /// No path in this function currently returns `Err`: lookup failures are
    /// recorded in the provenance and the result may simply have no
    /// candidates.
    pub fn translate(
        &self,
        surface: &str,
        source_lang: &str,
        target_lang: &str,
    ) -> Result<Translation, HttpError> {
        // Same language on both sides: nothing to look up.
        if source_lang == target_lang {
            let echo = WiktionaryCandidate {
                surface: surface.to_owned(),
                qualifier: None,
            };
            return Ok(Translation {
                source_surface: surface.to_owned(),
                source_lang: source_lang.to_owned(),
                target_lang: target_lang.to_owned(),
                meaning: MeaningId::from_wiktionary_page(source_lang, surface),
                candidates: vec![echo],
                provenance: vec!["identity".to_owned()],
            });
        }

        let mut trail: Vec<String> = Vec::new();
        let page_title = normalize_page_title(surface);
        let source_edition = Wiktionary::new(source_lang, self.http);
        let mut meaning = MeaningId::from_wiktionary_page(source_lang, &page_title);

        // Step 1: the page itself. A successful fetch is logged even when it
        // contains no blocks for the target language.
        let mut blocks: Vec<Vec<WiktionaryCandidate>> =
            match source_edition.translation_blocks(&page_title, target_lang) {
                Ok(found) => {
                    trail.push(format!(
                        "wiktionary:{source_lang}:{page_title}#translations->{target_lang}"
                    ));
                    found
                }
                Err(error) => {
                    trail.push(format!(
                        "wiktionary:{source_lang}:{page_title}#translations->error({error})"
                    ));
                    Vec::new()
                }
            };

        // Step 2: the dedicated translations subpage; only logged when it
        // actually contributes blocks (or fails).
        if blocks.is_empty() {
            let subpage = format!("{page_title}/translations");
            match source_edition.translation_blocks(&subpage, target_lang) {
                Ok(found) => {
                    if !found.is_empty() {
                        trail.push(format!(
                            "wiktionary:{source_lang}:{subpage}#translations->{target_lang}"
                        ));
                        blocks = found;
                    }
                }
                Err(error) => {
                    trail.push(format!(
                        "wiktionary:{source_lang}:{subpage}#translations->error({error})"
                    ));
                }
            }
        }

        // Step 3: reverse lookup for the original surface.
        if blocks.is_empty() {
            blocks = reverse_lookup(self.http, surface, source_lang, target_lang, &mut trail)
                .unwrap_or_default();
        }

        // Step 4: phrasal variants. The first variant that resolves also
        // becomes the title used for ranking and for the meaning id.
        let mut active_page_title = page_title.clone();
        if blocks.is_empty() {
            for variant in phrasal_variants(&page_title, source_lang) {
                trail.push(format!("wiktionary:{source_lang}:variant->{variant}"));
                let direct = match source_edition.translation_blocks(&variant, target_lang) {
                    Ok(found) if !found.is_empty() => {
                        trail.push(format!(
                            "wiktionary:{source_lang}:{variant}#translations->{target_lang}"
                        ));
                        Some(found)
                    }
                    Ok(_) => None,
                    Err(error) => {
                        trail.push(format!(
                            "wiktionary:{source_lang}:{variant}#translations->error({error})"
                        ));
                        None
                    }
                };
                let resolved = direct.or_else(|| {
                    reverse_lookup(self.http, &variant, source_lang, target_lang, &mut trail)
                });
                if let Some(found) = resolved {
                    blocks = found;
                    meaning = MeaningId::from_wiktionary_page(source_lang, &variant);
                    active_page_title = variant;
                    break;
                }
            }
        }

        let mut candidates = if blocks.is_empty() {
            Vec::new()
        } else {
            select_best_block(
                self.http,
                &active_page_title,
                source_lang,
                target_lang,
                &mut trail,
                blocks,
            )
        };

        // Wikidata is consulted even when Wiktionary produced nothing.
        if let Some(updated) = upgrade_meaning_via_wikidata(
            self.http,
            &active_page_title,
            source_lang,
            target_lang,
            &mut trail,
            &mut candidates,
        ) {
            meaning = updated;
        }

        Ok(Translation {
            source_surface: surface.to_owned(),
            source_lang: source_lang.to_owned(),
            target_lang: target_lang.to_owned(),
            meaning,
            candidates,
            provenance: trail,
        })
    }
}
/// Looks up the normalized `surface` page on the source-language edition and
/// then on the target-language edition, asking each for translation blocks
/// into `target_lang`.
///
/// Returns the first non-empty set of blocks and records a `#reverse->` entry
/// in `provenance` for it. Lookup errors are recorded and do not stop the
/// search; `None` means neither edition produced any blocks.
fn reverse_lookup<T: HttpClient + ?Sized>(
    http: &T,
    surface: &str,
    source_lang: &str,
    target_lang: &str,
    provenance: &mut Vec<String>,
) -> Option<Vec<Vec<WiktionaryCandidate>>> {
    let page_title = normalize_page_title(surface);
    for edition in [source_lang, target_lang] {
        let outcome = Wiktionary::new(edition, http).translation_blocks(&page_title, target_lang);
        let blocks = match outcome {
            Ok(blocks) => blocks,
            Err(error) => {
                provenance.push(format!(
                    "wiktionary:{edition}:{page_title}#reverse->error({error})"
                ));
                continue;
            }
        };
        if blocks.is_empty() {
            continue;
        }
        provenance.push(format!(
            "wiktionary:{edition}:{page_title}#reverse->{target_lang}"
        ));
        return Some(blocks);
    }
    None
}
/// Picks the translation block that is best confirmed by a round trip and
/// returns its candidates, confirmed ones first.
///
/// Every candidate of every block is looked up on the `target_lang`
/// Wiktionary, asking for translations back into `source_lang`. A candidate
/// is "confirmed" when some back-translation block contains `page_title`
/// (after normalization); its score is the index of `page_title` inside the
/// first such block. Each confirmation is appended to `provenance`. Failed or
/// empty lookups simply leave the candidate unconfirmed.
///
/// The block with the most confirmed candidates is chosen (the earliest block
/// wins ties, and block 0 is used when nothing is confirmed). Its confirmed
/// candidates are returned ordered by score, then original position, followed
/// by the unconfirmed candidates in their original order.
fn select_best_block<T: HttpClient + ?Sized>(
    http: &T,
    page_title: &str,
    source_lang: &str,
    target_lang: &str,
    provenance: &mut Vec<String>,
    blocks: Vec<Vec<WiktionaryCandidate>>,
) -> Vec<WiktionaryCandidate> {
    let target_edition = Wiktionary::new(target_lang, http);

    // One Option<usize> per candidate, grouped per block.
    let mut scored: Vec<Vec<Option<usize>>> = Vec::with_capacity(blocks.len());
    for (block_idx, block) in blocks.iter().enumerate() {
        let mut scores: Vec<Option<usize>> = Vec::with_capacity(block.len());
        for candidate in block {
            let candidate_page = normalize_page_title(&candidate.surface);
            let confirmed_at = if candidate_page.is_empty() {
                None
            } else {
                target_edition
                    .translation_blocks(&candidate_page, source_lang)
                    .ok()
                    .and_then(|back_blocks| {
                        back_blocks.iter().find_map(|back_block| {
                            back_block
                                .iter()
                                .position(|entry| normalize_page_title(&entry.surface) == page_title)
                        })
                    })
            };
            if let Some(pos) = confirmed_at {
                provenance.push(format!(
                    "wiktionary:{target_lang}:{candidate_page}#confirms->{source_lang}:{page_title}@{pos}[block{block_idx}]"
                ));
            }
            scores.push(confirmed_at);
        }
        scored.push(scores);
    }

    // Strictly-greater comparison keeps the earliest block on ties.
    let (best_block, _) = scored.iter().enumerate().fold(
        (0usize, 0usize),
        |(leader, leader_confirms), (idx, scores)| {
            let confirms = scores.iter().flatten().count();
            if confirms > leader_confirms {
                (idx, confirms)
            } else {
                (leader, leader_confirms)
            }
        },
    );

    let winner = blocks.into_iter().nth(best_block).unwrap_or_default();
    let winner_scores = scored.into_iter().nth(best_block).unwrap_or_default();

    let mut confirmed: Vec<(usize, WiktionaryCandidate)> = Vec::new();
    let mut unconfirmed: Vec<WiktionaryCandidate> = Vec::new();
    for (candidate, score) in winner.into_iter().zip(winner_scores) {
        match score {
            Some(pos) => confirmed.push((pos, candidate)),
            None => unconfirmed.push(candidate),
        }
    }
    // Stable sort: candidates with equal scores keep their original order.
    confirmed.sort_by_key(|(pos, _)| *pos);

    confirmed
        .into_iter()
        .map(|(_, candidate)| candidate)
        .chain(unconfirmed)
        .collect()
}
/// Tries to replace the Wiktionary-derived meaning with a Wikidata lexeme and
/// to extend `candidates` with that lexeme's translations.
///
/// Searches Wikidata for `page_title` in `source_lang` and uses the first
/// hit. Returns `None` when the search fails (the error is recorded in
/// `provenance`) or finds nothing. Otherwise the returned meaning is built
/// from the hit's id, even if fetching its `target_lang` translations fails.
/// Fetched lemmas are appended to `candidates` as unqualified entries unless a
/// candidate with the same surface is already present.
fn upgrade_meaning_via_wikidata<T: HttpClient + ?Sized>(
    http: &T,
    page_title: &str,
    source_lang: &str,
    target_lang: &str,
    provenance: &mut Vec<String>,
    candidates: &mut Vec<WiktionaryCandidate>,
) -> Option<MeaningId> {
    let wikidata = Wikidata::new(http);

    let hits = match wikidata.search_lexeme(page_title, source_lang) {
        Ok(hits) => hits,
        Err(error) => {
            provenance.push(format!("wikidata:search->error({error})"));
            return None;
        }
    };
    let first = hits.first()?;
    provenance.push(format!("wikidata:lexeme:{}", first.id));

    match wikidata.lexeme_translations(&first.id, target_lang) {
        Err(error) => {
            provenance.push(format!("wikidata:sparql->error({error})"));
        }
        Ok(lemmas) => {
            if !lemmas.is_empty() {
                provenance.push(format!(
                    "wikidata:sparql:{}->{} ({} lemmas)",
                    first.id,
                    target_lang,
                    lemmas.len()
                ));
            }
            for lemma in lemmas {
                let already_listed = candidates
                    .iter()
                    .any(|existing| existing.surface == lemma.value);
                if !already_listed {
                    candidates.push(WiktionaryCandidate {
                        surface: lemma.value,
                        qualifier: None,
                    });
                }
            }
        }
    }

    // The lexeme id is used whether or not its translations could be fetched.
    Some(MeaningId::from_sense(first.id.clone()))
}
/// Produces alternative page titles by dropping a Russian "у + pronoun"
/// phrase from the middle of `page_title`.
///
/// Only applies when `source_lang` is `"ru"` (ASCII case-insensitive); every
/// other language yields an empty list. A phrase is removed only where it is
/// surrounded by single spaces, so a phrase at the very start or end of the
/// title is left alone. Only the first occurrence of each phrase is removed,
/// whitespace in the result is collapsed, and duplicates are skipped. Variants
/// are returned in the order of the phrase table.
#[must_use]
pub fn phrasal_variants(page_title: &str, source_lang: &str) -> Vec<String> {
    const RU_POSSESSIVE_PHRASES: [&str; 8] = [
        "у тебя",
        "у вас",
        "у нас",
        "у меня",
        "у них",
        "у него",
        "у неё",
        "у нее",
    ];

    if !source_lang.eq_ignore_ascii_case("ru") {
        return Vec::new();
    }

    let mut out: Vec<String> = Vec::new();
    for phrase in RU_POSSESSIVE_PHRASES {
        let padded = format!(" {phrase} ");
        let Some((head, tail)) = page_title.split_once(padded.as_str()) else {
            continue;
        };
        // Re-join the words on either side of the removed phrase with single spaces.
        let shortened = head
            .split_whitespace()
            .chain(tail.split_whitespace())
            .collect::<Vec<_>>()
            .join(" ");
        if shortened.is_empty() || shortened == page_title || out.contains(&shortened) {
            continue;
        }
        out.push(shortened);
    }
    out
}
/// Normalizes a user-supplied surface form into a Wiktionary-style page title.
///
/// Leading whitespace is removed, then the whole trailing run of whitespace
/// and sentence-final punctuation is stripped, and finally the first
/// character is lowercased (the rest of the text is left untouched).
///
/// The stripped punctuation is `?`, `!`, `.`, the ideographic full stop `。`
/// and the fullwidth forms U+FF1F, U+FF01 and U+FF0E. Whitespace is stripped
/// together with the punctuation so that spaced punctuation, as in
/// `"Comment ça va ?"`, does not leave a dangling trailing space.
///
/// Returns an empty string when nothing but whitespace/punctuation was given.
#[must_use]
pub fn normalize_page_title(surface: &str) -> String {
    let is_trailing_noise = |c: char| {
        c.is_whitespace()
            || matches!(
                c,
                '?' | '!' | '.' | '。' | '\u{FF1F}' | '\u{FF01}' | '\u{FF0E}'
            )
    };
    let trimmed = surface.trim_start().trim_end_matches(is_trailing_noise);

    let mut chars = trimmed.chars();
    let Some(first) = chars.next() else {
        return String::new();
    };
    let mut out = String::with_capacity(trimmed.len());
    // Lowercasing one char may expand to several chars.
    out.extend(first.to_lowercase());
    out.extend(chars);
    out
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// URL stubbed for the English Wiktionary `hello` page in the tests below.
    const EN_HELLO_URL: &str = "https://en.wiktionary.org/w/api.php?action=parse&page=hello&prop=wikitext&formatversion=2&format=json&redirects=1";

    /// In-memory [`HttpClient`] that serves canned bodies keyed by exact URL
    /// and answers every other URL with a 404 status error.
    ///
    /// The map is never mutated after construction, so no lock is needed.
    struct StubHttp {
        responses: HashMap<String, String>,
    }

    impl StubHttp {
        /// Builds a stub from `(url, body)` pairs.
        fn new(pairs: &[(&str, &str)]) -> Self {
            Self {
                responses: pairs
                    .iter()
                    .map(|&(url, body)| (url.to_owned(), body.to_owned()))
                    .collect(),
            }
        }
    }

    impl HttpClient for StubHttp {
        fn get(&self, url: &str) -> Result<String, HttpError> {
            self.responses
                .get(url)
                .cloned()
                .ok_or_else(|| HttpError::Status {
                    status: 404,
                    body: format!("no stubbed response for {url}"),
                })
        }
    }

    #[test]
    fn normalize_page_title_strips_terminal_punctuation() {
        assert_eq!(normalize_page_title("Hello!"), "hello");
        assert_eq!(normalize_page_title("как у тебя дела?"), "как у тебя дела");
        assert_eq!(normalize_page_title("你好?"), "你好");
    }

    #[test]
    fn normalize_page_title_lowercases_first_letter() {
        assert_eq!(normalize_page_title("Hello"), "hello");
        assert_eq!(normalize_page_title("Как дела"), "как дела");
    }

    #[test]
    fn phrasal_variants_drops_embedded_possessive_phrase() {
        assert_eq!(
            phrasal_variants("как у тебя дела", "ru"),
            vec!["как дела".to_owned()]
        );
    }

    #[test]
    fn phrasal_variants_is_empty_for_other_languages_or_no_match() {
        assert!(phrasal_variants("как у тебя дела", "en").is_empty());
        assert!(phrasal_variants("как дела", "ru").is_empty());
    }

    #[test]
    fn translate_identity_returns_self_with_identity_provenance() {
        let http = StubHttp::new(&[]);
        let pipeline = TranslationPipeline::new(&http);
        let translation = pipeline.translate("hello", "en", "en").unwrap();
        assert_eq!(translation.primary_surface(), Some("hello"));
        assert_eq!(translation.provenance, vec!["identity".to_owned()]);
    }

    #[test]
    fn translate_uses_source_edition_translation_table() {
        let wikitext = r#"{"parse":{"title":"hello","wikitext":"* Russian: {{t+|ru|привет}}\n"}}"#;
        let http = StubHttp::new(&[(EN_HELLO_URL, wikitext)]);
        let pipeline = TranslationPipeline::new(&http);
        let translation = pipeline.translate("hello", "en", "ru").unwrap();
        assert_eq!(translation.primary_surface(), Some("привет"));
        assert!(
            translation
                .provenance
                .iter()
                .any(|p| p.starts_with("wiktionary:en:hello#translations->ru")),
            "got provenance: {:?}",
            translation.provenance,
        );
    }

    #[test]
    fn translate_returns_translation_with_empty_candidates_when_nothing_matches() {
        // Every request hits the stub's 404 path, so only error entries are recorded.
        let http = StubHttp::new(&[]);
        let pipeline = TranslationPipeline::new(&http);
        let translation = pipeline.translate("xyzzy", "en", "ru").unwrap();
        assert!(translation.candidates.is_empty());
        assert!(translation.primary_surface().is_none());
        assert!(translation.provenance.iter().any(|p| p.contains("error")));
    }

    #[test]
    fn translate_prefers_unqualified_candidate() {
        let wikitext = r#"{"parse":{"wikitext":"* Russian: {{t|ru|здравствуйте|q=formal}}, {{t+|ru|привет|q=informal}}, {{t|ru|здорово}}\n"}}"#;
        let http = StubHttp::new(&[(EN_HELLO_URL, wikitext)]);
        let pipeline = TranslationPipeline::new(&http);
        let translation = pipeline.translate("hello", "en", "ru").unwrap();
        assert_eq!(translation.primary_surface(), Some("здорово"));
    }
}