use crate::{
Map, Set,
cli::{GlossaryArgs, GlossaryExtendedArgs, IpaArgs, IpaMergedArgs, LangSpecs},
dict::{Dictionary, Langs, main::get_reading},
lang::{Edition, Lang},
models::{
kaikki::WordEntry,
yomitan::{
DetailedDefinition, Ipa, NTag, Node, PhoneticTranscription, TermInfo, TermMeta,
TermPhoneticTranscription, YomitanDict, wrap,
},
},
tags::{Pos, find_tag_in_bank, localize_tag, localize_tag_info},
};
/// Marker type for the glossary dictionary.
///
/// Its [`Dictionary`] implementation forwards every entry to
/// `process_glossary` and accumulates `TermInfo` values.
#[derive(Debug, Clone, Copy)]
pub struct DGlossary;
/// Marker type for the extended glossary dictionary.
///
/// Its [`Dictionary`] implementation pairs translations into the source
/// language with translations into the target language (see
/// `process_glossary_extended`) and merges rows per lemma in `postprocess`.
#[derive(Debug, Clone, Copy)]
pub struct DGlossaryExtended;
/// Marker type for the IPA dictionary.
///
/// Its [`Dictionary`] implementation collects pronunciations through
/// `process_ipa` and emits them as phonetic-transcription term metadata.
#[derive(Debug, Clone, Copy)]
pub struct DIpa;
/// Marker type for the merged IPA dictionary.
///
/// Processes entries exactly like [`DIpa`], and additionally sorts the
/// accumulated map by key in `postprocess`.
#[derive(Debug, Clone, Copy)]
pub struct DIpaMerged;
/// Glossary dictionary: accumulates one `TermInfo` per entry that has
/// translations into the target language.
impl Dictionary for DGlossary {
    type A = GlossaryArgs;
    type I = Vec<TermInfo>;

    /// Builds the term for `entry` (if it has matching translations) and
    /// appends it to `irs`.
    fn process(&self, langs: Langs, entry: &WordEntry, irs: &mut Self::I) {
        let (edition, target) = (langs.edition, langs.target);
        process_glossary(edition, target, entry, irs);
    }

    /// Wraps a copy of the accumulated terms in a `YomitanDict`; the two
    /// remaining constructor arguments are left empty.
    fn to_yomitan(&self, _langs: LangSpecs, irs: &Self::I) -> YomitanDict {
        let terms = irs.clone();
        YomitanDict::new(terms, Vec::new(), Vec::new())
    }
}
/// Extended glossary dictionary: rows of `(lemma, pos, edition, translations)`
/// that are merged per lemma before being converted to terms.
impl Dictionary for DGlossaryExtended {
    type A = GlossaryExtendedArgs;
    type I = IGlossaryExtended;

    /// This dictionary reports that it does not support probing.
    fn supports_probe(&self) -> bool {
        false
    }

    /// Appends the rows derived from `entry` to `irs`.
    fn process(&self, langs: Langs, entry: &WordEntry, irs: &mut Self::I) {
        let (edition, source, target) = (langs.edition, langs.source, langs.target);
        process_glossary_extended(edition, source, target, entry, irs);
    }

    /// Collapses all rows that share a lemma into a single row.
    ///
    /// Rows are grouped by lemma alone: the part of speech and edition of the
    /// first row seen for a lemma are kept, and the translations of every row
    /// with that lemma are gathered into one set.
    fn postprocess(&self, irs: &mut Self::I) {
        let mut merged = Map::default();
        for (lemma, pos, edition, translations) in irs.drain(..) {
            let (_, _, seen) = merged
                .entry(lemma)
                .or_insert_with(|| (pos, edition, Set::default()));
            seen.extend(translations);
        }
        for (lemma, (pos, edition, seen)) in merged {
            irs.push((lemma, pos, edition, seen.into_iter().collect::<Vec<_>>()));
        }
    }

    /// Converts the rows to terms (tags localized for `langs.target`) and wraps
    /// them in a `YomitanDict`; the two remaining constructor arguments are
    /// left empty.
    fn to_yomitan(&self, langs: LangSpecs, irs: &Self::I) -> YomitanDict {
        let terms = to_yomitan_glossary_extended(langs.target, irs);
        YomitanDict::new(terms, Vec::new(), Vec::new())
    }
}
/// IPA dictionary: accumulates pronunciations keyed by `(word, reading)`.
impl Dictionary for DIpa {
    type A = IpaArgs;
    type I = IIpa;

    /// Merges the pronunciations of `entry` into `irs`.
    fn process(&self, langs: Langs, entry: &WordEntry, irs: &mut Self::I) {
        let (edition, source, target) = (langs.edition, langs.source, langs.target);
        process_ipa(edition, source, target, entry, irs);
    }

    /// Emits the accumulated pronunciations as the third `YomitanDict`
    /// constructor argument; the first two are left empty.
    fn to_yomitan(&self, _langs: LangSpecs, irs: &Self::I) -> YomitanDict {
        let term_meta = to_yomitan_ipa(irs);
        YomitanDict::new(Vec::new(), Vec::new(), term_meta)
    }
}
/// Merged IPA dictionary: same accumulation as `DIpa`, with the map sorted by
/// key once processing is done.
impl Dictionary for DIpaMerged {
    type A = IpaMergedArgs;
    type I = IIpa;

    /// Merges the pronunciations of `entry` into `irs`.
    fn process(&self, langs: Langs, entry: &WordEntry, irs: &mut Self::I) {
        let (edition, source, target) = (langs.edition, langs.source, langs.target);
        process_ipa(edition, source, target, entry, irs);
    }

    /// Sorts the accumulated map by its `(word, reading)` keys.
    fn postprocess(&self, irs: &mut Self::I) {
        irs.sort_unstable_keys();
    }

    /// Emits the accumulated pronunciations as the third `YomitanDict`
    /// constructor argument; the first two are left empty.
    fn to_yomitan(&self, _langs: LangSpecs, irs: &Self::I) -> YomitanDict {
        let term_meta = to_yomitan_ipa(irs);
        YomitanDict::new(Vec::new(), Vec::new(), term_meta)
    }
}
/// Builds a glossary term for `entry` from its translations into `target`.
///
/// Translations whose language code equals `target.iso()` are grouped by
/// sense. Nothing is pushed when no translation matches. Translations with an
/// empty sense become plain text definitions; every other sense becomes a
/// structured `div` holding the sense in a `span` followed by a `ul` with one
/// `li` per translation. The reading falls back to its default value when
/// `get_reading` yields none.
fn process_glossary(source: Edition, target: Lang, entry: &WordEntry, irs: &mut Vec<TermInfo>) {
    let mut by_sense: Map<&str, Vec<String>> = Map::default();
    for translation in entry.non_trivial_translations() {
        if translation.lang_code != target.iso() {
            continue;
        }
        by_sense
            .entry(&translation.sense)
            .or_default()
            .push(translation.word.clone());
    }
    if by_sense.is_empty() {
        return;
    }

    let mut definitions = Vec::new();
    for (sense, words) in by_sense {
        if sense.is_empty() {
            // Sense-less translations are emitted as flat text definitions.
            for word in words {
                definitions.push(DetailedDefinition::Text(word));
            }
        } else {
            let heading = wrap(NTag::Span, "", Node::Text(sense.to_string()));
            let items: Vec<_> = words
                .into_iter()
                .map(|word| wrap(NTag::Li, "", Node::Text(word)))
                .collect();
            let list = wrap(NTag::Ul, "", Node::Array(items));
            let block = wrap(NTag::Div, "", Node::Array(vec![heading, list]));
            definitions.push(DetailedDefinition::structured(block));
        }
    }

    let reading = get_reading(source, target, entry).unwrap_or_default();
    let definition_tags = find_tag_in_bank(&entry.pos)
        .map(|mut tag_info| {
            localize_tag_info(target, &mut tag_info);
            vec![tag_info]
        })
        .unwrap_or_default();
    let pos = Pos::from(entry.pos.as_str());
    let rules = pos.short().to_string();

    irs.push(TermInfo::new(
        entry.word.clone(),
        reading,
        definition_tags,
        rules,
        definitions,
    ));
}
/// Intermediate rows of the extended glossary: `(lemma, pos, edition, translations)`.
type IGlossaryExtended = Vec<(String, Pos, Edition, Vec<String>)>;
/// Pairs, sense by sense, the translations of `entry` into `source` with its
/// translations into `target`.
///
/// For every sense that has at least one translation in both languages, one
/// row is appended to `irs` per source-language word: that word is the lemma
/// and all target-language words of the same sense are its translations.
/// Duplicate source words yield duplicate rows.
fn process_glossary_extended(
    edition: Edition,
    source: Lang,
    target: Lang,
    entry: &WordEntry,
    irs: &mut IGlossaryExtended,
) {
    // sense -> (words in the target language, words in the source language)
    let mut by_sense: Map<&str, (Vec<&str>, Vec<&str>)> = Map::default();
    for translation in entry.non_trivial_translations() {
        let is_target = translation.lang_code == target.iso();
        let is_source = translation.lang_code == source.iso();
        // The map entry is created only inside a matching branch so that
        // senses are registered in the order they first match a language.
        if is_target {
            by_sense
                .entry(&translation.sense)
                .or_default()
                .0
                .push(&translation.word);
        }
        if is_source {
            by_sense
                .entry(&translation.sense)
                .or_default()
                .1
                .push(&translation.word);
        }
    }

    // Keep only senses translated into both languages.
    by_sense.retain(|_, (targets, sources)| !(targets.is_empty() || sources.is_empty()));
    if by_sense.is_empty() {
        return;
    }

    for (targets, sources) in by_sense.values() {
        for lemma in sources {
            let definitions: Vec<String> = targets.iter().map(|def| String::from(*def)).collect();
            irs.push((
                String::from(*lemma),
                Pos::from(entry.pos.as_str()),
                edition,
                definitions,
            ));
        }
    }
}
/// Converts extended-glossary rows into terms.
///
/// Each row becomes one `TermInfo` with an empty reading, the part-of-speech
/// tag (when found in the tag bank) localized for `target`, the short POS name
/// as rules, and one text definition per translation. The row's edition is
/// not used.
fn to_yomitan_glossary_extended(target: Lang, irs: &IGlossaryExtended) -> Vec<TermInfo> {
    let mut terms = Vec::with_capacity(irs.len());
    for (lemma, pos, _edition, translations) in irs {
        let definition_tags = find_tag_in_bank(pos.long())
            .map(|mut tag_info| {
                localize_tag_info(target, &mut tag_info);
                vec![tag_info]
            })
            .unwrap_or_default();
        let definitions: Vec<_> = translations
            .iter()
            .map(|translation| DetailedDefinition::Text(translation.clone()))
            .collect();
        terms.push(TermInfo::new(
            lemma.clone(),
            String::new(),
            definition_tags,
            pos.short().to_string(),
            definitions,
        ));
    }
    terms
}
/// Normalizes the delimiters of an IPA transcription.
///
/// Text already enclosed in `[...]` or `/.../` is returned unchanged. Text
/// enclosed in backslashes has them replaced by slashes. Anything else is
/// wrapped in slashes as-is.
fn normalize_ipa(text: &str) -> String {
    let enclosed_by = |open: char, close: char| text.starts_with(open) && text.ends_with(close);
    if enclosed_by('[', ']') || enclosed_by('/', '/') {
        return text.to_owned();
    }
    // A lone backslash has no closing partner, so it is kept and wrapped.
    let body = text
        .strip_prefix('\\')
        .and_then(|rest| rest.strip_suffix('\\'))
        .unwrap_or(text);
    format!("/{body}/")
}
/// Returns `text` without its surrounding IPA delimiters.
///
/// A matching `[`…`]`, `/`…`/` or `\`…`\` pair is stripped; text without such
/// a pair (including a single delimiter character) is returned unchanged.
fn ipa_inner(text: &str) -> &str {
    const DELIMITERS: [(char, char); 3] = [('[', ']'), ('/', '/'), ('\\', '\\')];
    DELIMITERS
        .iter()
        .find_map(|&(open, close)| text.strip_prefix(open)?.strip_suffix(close))
        .unwrap_or(text)
}
/// Returns `true` when `text` is written in square brackets, i.e. it starts
/// with `[` and ends with `]`.
fn is_phonetic(text: &str) -> bool {
    // Both brackets are ASCII, so comparing the first and last bytes is
    // equivalent to comparing the first and last characters.
    matches!(text.as_bytes(), [b'[', .., b']'])
}
/// Collects the pronunciations of `entry`.
///
/// Sounds with an empty `ipa` are skipped. Each remaining sound yields an
/// `Ipa` whose transcription is delimiter-normalized and whose tags are the
/// sound's tags, followed by its note when the note is not empty.
fn get_ipas(entry: &WordEntry) -> Vec<Ipa> {
    let mut ipas = Vec::new();
    for sound in entry.sounds.iter() {
        if sound.ipa.is_empty() {
            continue;
        }
        let mut tags = sound.tags.clone();
        if !sound.note.is_empty() {
            tags.push(sound.note.clone());
        }
        let ipa = normalize_ipa(&sound.ipa);
        ipas.push(Ipa { ipa, tags });
    }
    ipas
}
/// Accumulated pronunciations, keyed by `(word, reading)`.
type IIpa = Map<(String, String), Vec<Ipa>>;
/// Merges the pronunciations of `entry` into `irs`.
///
/// Tags found in the tag bank are replaced by their short form, localized for
/// `target` when a localization exists. The pronunciations are then stored
/// under the key `(word, reading)`, where the reading falls back to the word
/// itself. A pronunciation whose delimiter-stripped text is already present is
/// merged into the existing one: a bracketed (`[...]`) spelling replaces the
/// stored spelling, and tags not yet present are appended.
fn process_ipa(edition: Edition, source: Lang, target: Lang, entry: &WordEntry, irs: &mut IIpa) {
    let mut ipas = get_ipas(entry);
    if ipas.is_empty() {
        return;
    }

    for tag in ipas.iter_mut().flat_map(|ipa| ipa.tags.iter_mut()) {
        if let Some(tag_info) = find_tag_in_bank(tag) {
            *tag = localize_tag(target, &tag_info.short_tag).map_or_else(
                || (*tag_info.short_tag).to_string(),
                |(short, _)| short.to_string(),
            );
        }
    }

    let reading = get_reading(edition, source, entry).unwrap_or_else(|| entry.word.clone());
    let bucket = irs.entry((entry.word.clone(), reading)).or_default();
    for ipa in ipas {
        let inner = ipa_inner(&ipa.ipa);
        let found = bucket
            .iter()
            .position(|known| ipa_inner(&known.ipa) == inner);
        match found {
            None => bucket.push(ipa),
            Some(idx) => {
                let known = &mut bucket[idx];
                // Prefer the bracketed spelling when both forms occur.
                if is_phonetic(&ipa.ipa) {
                    known.ipa = ipa.ipa;
                }
                for tag in ipa.tags {
                    if !known.tags.contains(&tag) {
                        known.tags.push(tag);
                    }
                }
            }
        }
    }
}
/// Converts the accumulated pronunciations into phonetic-transcription term
/// metadata, one item per `(lemma, reading)` key, in map order.
fn to_yomitan_ipa(irs: &IIpa) -> Vec<TermMeta> {
    // `irs` is a shared reference, so `iter()` states the by-reference
    // iteration explicitly (calling `into_iter()` here would not consume it).
    irs.iter()
        .map(|((lemma, reading), transcriptions)| {
            let transcription = PhoneticTranscription {
                reading: reading.clone(),
                transcriptions: transcriptions.clone(),
            };
            TermMeta::TermPhoneticTranscription(TermPhoneticTranscription::new(
                lemma.clone(),
                transcription,
            ))
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::kaikki::{Sound, Translation};

    impl Translation {
        /// Test-only constructor.
        fn new(lang_code: &str, sense: &str, word: &str) -> Self {
            Self {
                word: word.into(),
                sense: sense.into(),
                lang_code: lang_code.into(),
            }
        }
    }

    impl Sound {
        /// Test-only constructor: a sound with only its `ipa` set.
        fn new(ipa: &str) -> Self {
            Self {
                ipa: ipa.into(),
                ..Default::default()
            }
        }

        /// Test-only constructor: a sound with its `ipa` and a single tag.
        fn with_tag(ipa: &str, tag: &str) -> Self {
            Self {
                tags: vec![tag.into()],
                ..Self::new(ipa)
            }
        }
    }

    #[test]
    fn process_glossary_extended_basic() {
        const SENSE: &str = "British overseas territory";

        let dict = DGlossaryExtended;
        let langs = Langs::new(Edition::En, Lang::Grc, Lang::Sh);
        let entry = WordEntry {
            pos: "noun".to_string(),
            translations: vec![
                Translation::new("grc", SENSE, "Ἡράκλειαι στῆλαι"),
                Translation::new("grc", SENSE, "Ἡράκλειαι στῆλαι"),
                Translation::new("grc", SENSE, "Κάλπη"),
                Translation::new("sh", SENSE, "Gibraltar"),
                Translation::new("sh", SENSE, "Gjibraltari"),
                Translation::new("sh", "Different sense", "Foo"),
            ],
            ..WordEntry::default()
        };

        let mut irs = Vec::new();
        dict.process(langs, &entry, &mut irs);
        // An entry without translations must not contribute any row.
        dict.process(langs, &WordEntry::default(), &mut irs);

        assert_eq!(irs.len(), 3);
        let (_, pos, _, _) = &irs[0];
        assert_eq!(pos.long(), "noun");

        let lemmas: Vec<&str> = irs.iter().map(|(lemma, ..)| lemma.as_str()).collect();
        assert_eq!(lemmas, ["Ἡράκλειαι στῆλαι", "Ἡράκλειαι στῆλαι", "Κάλπη"]);

        let expected = vec!["Gibraltar".to_string(), "Gjibraltari".to_string()];
        for (_, _, _, defs) in &irs {
            assert_eq!(defs, &expected);
        }

        // The duplicated lemma is merged into a single row.
        dict.postprocess(&mut irs);
        assert_eq!(irs.len(), 2);

        let yomitan_entries = to_yomitan_glossary_extended(Lang::Grc, &irs);
        assert_eq!(yomitan_entries.len(), 2);
        let term_bank = &yomitan_entries[0];
        assert_eq!(term_bank.definition_tags[0].short_tag, "n");
    }

    #[test]
    fn process_glossary_extended_pos_localization() {
        let dict = DGlossaryExtended;
        let langs = Langs::new(Edition::En, Lang::Ja, Lang::En);
        let entry = WordEntry {
            pos: "noun".to_string(),
            translations: vec![
                Translation::new("ja", "some sense", "日本語"),
                Translation::new("en", "some sense", "english"),
            ],
            ..WordEntry::default()
        };

        let mut irs = IGlossaryExtended::new();
        dict.process(langs, &entry, &mut irs);
        assert_eq!(irs.len(), 1);
        let (_, pos, _, _) = &irs[0];
        assert_eq!(pos.long(), "noun");

        let yomitan_entries = to_yomitan_glossary_extended(Lang::Ja, &irs);
        let term_bank = &yomitan_entries[0];
        assert_eq!(term_bank.definition_tags[0].short_tag, "名");
    }

    #[test]
    fn process_ipa_merged_basic() {
        let dict = DIpaMerged;
        let langs = Langs::new(Edition::En, Lang::Grc, Lang::Sh);
        let entry = WordEntry {
            sounds: vec![
                Sound::new("/ipa1/"),
                Sound::new("[ipa1]"),
                Sound::new("/ipa1/"),
                Sound::new("[ipa2]"),
            ],
            ..WordEntry::default()
        };

        let mut irs = IIpa::default();
        dict.process(langs, &entry, &mut irs);
        assert_eq!(irs.len(), 1);

        let transcriptions = irs.values().next().unwrap();
        let ipas: Vec<&str> = transcriptions.iter().map(|t| t.ipa.as_str()).collect();
        assert_eq!(ipas, ["[ipa1]", "[ipa2]"]);
    }

    #[test]
    fn process_ipa_tag() {
        let dict = DIpa;
        let langs = Langs::new(Edition::En, Lang::La, Lang::La);
        let entry = WordEntry {
            sounds: vec![
                Sound::with_tag("[ipa1]", "tag1"),
                Sound::with_tag("[ipa2]", "modern Italianate Ecclesiastical"),
            ],
            ..WordEntry::default()
        };

        let mut irs = IIpa::default();
        dict.process(langs, &entry, &mut irs);
        assert_eq!(irs.len(), 1);

        let transcriptions = irs.values().next().unwrap();
        let ipas: Vec<&str> = transcriptions.iter().map(|t| t.ipa.as_str()).collect();
        assert_eq!(ipas, ["[ipa1]", "[ipa2]"]);
        assert_eq!(transcriptions[0].tags[0], "tag1");
        assert_eq!(transcriptions[1].tags[0], "⛪");
    }

    #[test]
    fn process_ipa_merged_tag_merge() {
        let dict = DIpaMerged;
        let (source, target) = (Lang::La, Lang::La);

        // The same transcription seen in two editions, each with its own tag.
        let mut irs = IIpa::default();
        for (edition, tag) in [(Edition::En, "tag1"), (Edition::De, "tag2")] {
            let langs = Langs::new(edition, source, target);
            let entry = WordEntry {
                sounds: vec![Sound::with_tag("ipa1", tag)],
                ..WordEntry::default()
            };
            dict.process(langs, &entry, &mut irs);
        }

        let transcriptions = irs.values().next().unwrap();
        assert_eq!(transcriptions.len(), 1);
        for tag in ["tag1", "tag2"] {
            assert!(transcriptions[0].tags.contains(&tag.to_string()));
        }
    }

    #[test]
    fn process_ipa_merged_postprocess_order() {
        let dict = DIpaMerged;
        let (source, target) = (Lang::La, Lang::La);
        let langs = Langs::new(Edition::En, source, target);

        let mut irs = IIpa::default();
        for (word, ipa) in [("zebra", "ipa1"), ("apple", "ipa2")] {
            let entry = WordEntry {
                word: word.to_string(),
                sounds: vec![Sound::new(ipa)],
                ..WordEntry::default()
            };
            dict.process(langs, &entry, &mut irs);
        }

        dict.postprocess(&mut irs);
        let words: Vec<&str> = irs.keys().map(|(word, _)| word.as_str()).collect();
        assert_eq!(words, ["apple", "zebra"]);
    }
}