use std::{fs::File, io::BufWriter, sync::LazyLock};
use anyhow::Result;
use indexmap::map::Entry;
use regex::Regex;
use serde::{Deserialize, Serialize};
use unicode_normalization::UnicodeNormalization;
mod heap;
use heap::HeapSize;
use crate::{
Map, Set,
cli::{LangSpecs, Options},
dict::Intermediate,
lang::{Edition, Lang},
models::kaikki::{Example, Form, HeadTemplate, Pos, Sense, Synonym, Tag, WordEntry},
path::PathManager,
tags::{
REDUNDANT_FORM_TAGS, merge_case_tags, merge_person_tags, merge_verb_form_tags,
remove_redundant_tags, sort_tags, sort_tags_by_similar,
},
utils::{human_size, link_kaikki, link_wiktionary, pretty_println_at_path},
};
/// Maximum number of synonyms kept per lemma entry (see `process_entry`).
const MAX_NUMBER_OF_SYNONYMS: usize = 3;
/// Maximum number of examples kept per sense (see `get_gloss_tree`).
const MAX_NUMBER_OF_EXAMPLES: usize = 3;
/// Examples whose text has more characters than this are dropped.
const MAX_SIZE_OF_EXAMPLE: usize = 120;
/// Example references with more characters than this are replaced by an empty string.
const MAX_SIZE_OF_EXAMPLE_REFERENCE: usize = 120;
/// Intermediate representation built from word entries: a map of lemmas
/// (`lemma_map`) plus a map of inflected/alternative forms pointing back to
/// their uninflected word (`form_map`).
#[derive(Debug, Default)]
pub struct Tidy {
pub lemma_map: LemmaMap, pub form_map: FormMap, }
/// Trait glue: both methods forward to the inherent `Tidy` methods of the
/// same name.
impl Intermediate for Tidy {
fn len(&self) -> usize {
// Inherent methods take precedence over trait methods during method
// resolution, so this calls `Tidy::len` from the inherent impl and does
// not recurse into this trait method.
self.len()
}
fn write(&self, pm: &PathManager) -> Result<()> {
// Same resolution rule as above: this is the inherent `Tidy::write`.
self.write(pm)
}
}
impl Tidy {
fn len(&self) -> usize {
self.lemma_map.len() + self.form_map.len()
}
fn insert_lemma(&mut self, lemma: &str, reading: &str, pos: &str, entry: LemmaInfo) {
debug_assert!(!entry.gloss_tree.is_empty());
let key = LemmaKey {
lemma: lemma.into(),
reading: reading.into(),
pos: pos.into(),
};
match self.lemma_map.0.entry(key) {
Entry::Vacant(e) => {
e.insert(vec![entry]);
}
Entry::Occupied(mut e) => {
e.get_mut().push(entry);
}
}
}
fn insert_form(
&mut self,
uninflected: &str,
inflected: &str,
pos: &str,
source: FormSource,
tags: Vec<Tag>,
) {
if tags.is_empty()
|| uninflected.is_empty()
|| inflected.is_empty()
|| uninflected == inflected
{
return;
}
let key = FormKey {
uninflected: uninflected.into(),
inflected: inflected.into(),
pos: pos.into(),
};
match self.form_map.0.entry(key) {
Entry::Vacant(e) => {
e.insert((source, tags));
}
Entry::Occupied(mut e) => {
e.get_mut().1.extend(tags);
}
}
}
#[tracing::instrument(skip_all)]
fn write(&self, pm: &PathManager) -> Result<()> {
let opath = pm.path_lemmas();
let file = File::create(&opath)?;
let writer = BufWriter::new(file);
if pm.opts.pretty {
serde_json::to_writer_pretty(writer, &self.lemma_map)?;
} else {
serde_json::to_writer(writer, &self.lemma_map)?;
}
if !pm.opts.quiet {
pretty_println_at_path("Wrote tidy lemmas", &opath);
}
let opath = pm.path_forms();
let file = File::create(&opath)?;
let writer = BufWriter::new(file);
if pm.opts.pretty {
serde_json::to_writer_pretty(writer, &self.form_map)?;
} else {
serde_json::to_writer(writer, &self.form_map)?;
}
if !pm.opts.quiet {
pretty_println_at_path("Wrote tidy forms", &opath);
}
Ok(())
}
}
pub(crate) fn postprocess_main(irs: &mut Tidy) {
postprocess_forms(&mut irs.form_map);
}
/// Debug helper: logs how many form entries point to an uninflected word
/// that has no lemma entry.
#[allow(unused)]
fn check_orphaned_redirects(irs: &mut Tidy) {
    let total = irs.form_map.len();
    let lemmas_found: Set<&str> = irs
        .lemma_map
        .0
        .keys()
        .map(|key| key.lemma.as_str())
        .collect();
    let orphaned_count = irs
        .form_map
        .flat_iter()
        .filter(|(uninfl, ..)| !lemmas_found.contains(*uninfl))
        .count();
    tracing::error!("{orphaned_count} orphaned_count from {total}");
}
/// Logs (at debug level) how many lemmas and forms were collected.
///
/// When the combined heap size of both maps exceeds 500 MiB a detailed,
/// multi-line report including sizes is emitted; otherwise a single summary
/// line is logged.
pub(crate) fn found_ir_message_impl(langs: LangSpecs, irs: &Tidy) {
    const MB: f64 = 1024.0 * 1024.0;
    const DETAILED_REPORT_THRESHOLD: f64 = 500.0 * MB;

    // Entry counts.
    let (n_lemmas, n_forms) = (irs.lemma_map.len(), irs.form_map.len());
    let n_irs = n_lemmas + n_forms;
    let n_forms_inflection = irs.form_map.len_inflection();
    let n_forms_extracted = irs.form_map.len_extracted();
    let n_forms_alt_of = irs.form_map.len_alt_of();
    debug_assert_eq!(
        n_forms,
        n_forms_inflection + n_forms_extracted + n_forms_alt_of,
        "mismatch in form counts"
    );

    // Heap usage, raw and human-readable.
    let lemma_bytes = irs.lemma_map.heap_size() as f64;
    let form_bytes = irs.form_map.heap_size() as f64;
    let total_bytes = lemma_bytes + form_bytes;
    let lemma_size = human_size(lemma_bytes);
    let form_size = human_size(form_bytes);
    let total_size = human_size(total_bytes);

    if total_bytes > DETAILED_REPORT_THRESHOLD {
        tracing::debug!(
            "[{}-{}] Found {} irs ({})",
            langs.source,
            langs.target,
            n_irs,
            total_size,
        );
        tracing::debug!("├─ lemmas: {} ({})", n_lemmas, lemma_size,);
        tracing::debug!(
            "└─ forms : {} ({}) [infl {}, extr {}, alt {}]",
            n_forms,
            form_size,
            n_forms_inflection,
            n_forms_extracted,
            n_forms_alt_of,
        );
    } else {
        tracing::debug!(
            "Found {n_irs} irs: {n_lemmas} lemmas, {n_forms} forms \
             [{n_forms_inflection} infl, {n_forms_extracted} extr, {n_forms_alt_of} alt]"
        );
    }
}
/// Key of the lemma map: the lemma itself, its reading, and its part of speech.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
struct LemmaKey {
lemma: String,
reading: String,
pos: Pos,
}
/// Flat map from `(lemma, reading, pos)` to every `LemmaInfo` inserted under
/// that key. Serialized as a nested `lemma -> reading -> pos -> infos` map.
#[derive(Debug, Default)]
pub struct LemmaMap(Map<LemmaKey, Vec<LemmaInfo>>);
impl Serialize for LemmaMap {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let mut nested: Map<&str, Map<&str, Map<&str, &Vec<LemmaInfo>>>> = Map::default();
for (key, infos) in &self.0 {
nested
.entry(&key.lemma)
.or_default()
.entry(&key.reading)
.or_default()
.insert(&key.pos, infos);
}
nested.serialize(serializer)
}
}
impl LemmaMap {
pub fn into_flat_iter(self) -> impl Iterator<Item = (String, String, Pos, LemmaInfo)> {
self.0.into_iter().flat_map(|(key, infos)| {
let lemma = key.lemma;
let reading = key.reading;
let pos = key.pos;
infos
.into_iter()
.map(move |info| (lemma.clone(), reading.clone(), pos.clone(), info))
})
}
fn len(&self) -> usize {
self.0.values().map(Vec::len).sum()
}
}
/// Key of the form map: the uninflected word, the inflected form, and the
/// part of speech.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct FormKey {
uninflected: String,
inflected: String,
pos: Pos,
}
/// Flat map from `(uninflected, inflected, pos)` to the source that first
/// registered the form and the tags collected for it. Serialized as a nested
/// `uninflected -> inflected -> pos -> (source, tags)` map.
#[derive(Debug, Default)]
pub struct FormMap(Map<FormKey, (FormSource, Vec<String>)>);
impl Serialize for FormMap {
    /// Serializes the flat map as nested maps:
    /// `uninflected -> inflected -> pos -> (source, tags)`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        #[expect(clippy::type_complexity)]
        let mut by_uninflected: Map<&str, Map<&str, Map<&str, &(FormSource, Vec<String>)>>> =
            Map::default();
        for (key, source_and_tags) in &self.0 {
            let by_inflected = by_uninflected.entry(key.uninflected.as_str()).or_default();
            let by_pos = by_inflected.entry(key.inflected.as_str()).or_default();
            by_pos.insert(key.pos.as_str(), source_and_tags);
        }
        by_uninflected.serialize(serializer)
    }
}
impl FormMap {
    /// Borrowing iterator over `(uninflected, inflected, pos, source, tags)`.
    pub fn flat_iter(&self) -> impl Iterator<Item = (&str, &str, &str, &FormSource, &Vec<String>)> {
        self.0.iter().map(|(key, (source, tags))| {
            (
                key.uninflected.as_str(),
                key.inflected.as_str(),
                key.pos.as_str(),
                source,
                tags,
            )
        })
    }

    /// Consuming iterator over `(uninflected, inflected, pos, source, tags)`.
    pub fn into_flat_iter(
        self,
    ) -> impl Iterator<Item = (String, String, String, FormSource, Vec<String>)> {
        self.0
            .into_iter()
            .map(|(key, (source, tags))| (key.uninflected, key.inflected, key.pos, source, tags))
    }

    /// Like [`FormMap::flat_iter`], but with mutable access to source and tags.
    pub fn flat_iter_mut(
        &mut self,
    ) -> impl Iterator<Item = (&str, &str, &str, &mut FormSource, &mut Vec<String>)> {
        self.0.iter_mut().map(|(key, (source, tags))| {
            (
                key.uninflected.as_str(),
                key.inflected.as_str(),
                key.pos.as_str(),
                source,
                tags,
            )
        })
    }

    /// Number of form entries (one per key).
    fn len(&self) -> usize {
        // The map already tracks its size; no need to walk every entry.
        self.0.len()
    }

    /// Number of entries whose recorded source equals `source`.
    fn len_of(&self, source: FormSource) -> usize {
        self.0.values().filter(|(src, _)| *src == source).count()
    }

    /// Number of entries recorded with `FormSource::Extracted`.
    fn len_extracted(&self) -> usize {
        self.len_of(FormSource::Extracted)
    }

    /// Number of entries recorded with `FormSource::Inflection`.
    fn len_inflection(&self) -> usize {
        self.len_of(FormSource::Inflection)
    }

    /// Number of entries recorded with either alt-of source, counted in a
    /// single pass over the map.
    fn len_alt_of(&self) -> usize {
        self.0
            .values()
            .filter(|(src, _)| matches!(src, FormSource::AltOfTop | FormSource::AltOfSense))
            .count()
    }
}
/// Records which processing step registered a form entry.
///
/// Variant names are serialized in lowercase (`rename_all = "lowercase"`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum FormSource {
// Taken from the entry's own list of forms (`process_forms`).
Extracted,
// Derived from an inflection sense or a gloss-less entry
// (`handle_inflection_sense*`, `process_no_gloss`).
Inflection,
// Derived from the entry-level `alt_of` list (`process_alt_forms`).
AltOfTop,
// Derived from a sense-level `alt_of` list (`handle_alt_of_sense`).
AltOfSense,
}
/// Everything stored for one lemma entry. Empty vectors and `None` options
/// are omitted from the serialized output.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct LemmaInfo {
// Nested glosses of the entry, keyed by gloss text.
pub gloss_tree: GlossTree,
// Entry-level tags.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub tags: Vec<Tag>,
// At most `MAX_NUMBER_OF_SYNONYMS` synonyms (see `process_entry`).
#[serde(skip_serializing_if = "Vec::is_empty")]
pub synonyms: Vec<Synonym>,
// Etymology texts joined with newlines, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub etymology_text: Option<String>,
// Head-template expansions joined with newlines, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub head_info_text: Option<String>,
// Serialized under the key `wlink`.
#[serde(rename = "wlink")]
pub link_wiktionary: String,
// Serialized under the key `klink`.
#[serde(rename = "klink")]
pub link_kaikki: String,
}
/// One level of the gloss hierarchy: gloss text mapped to its info, which in
/// turn holds the nested child glosses.
pub type GlossTree = Map<String, GlossInfo>;
/// Data attached to a single gloss node. Empty collections are omitted when
/// serializing, and missing fields fall back to their defaults when
/// deserializing (`#[serde(default)]`).
#[derive(Debug, Serialize, Deserialize, Default, Clone)]
#[serde(default)]
pub struct GlossInfo {
// Tags of the gloss; narrowed to the common tags when several senses
// share the gloss (see `insert_glosses`).
#[serde(skip_serializing_if = "Vec::is_empty")]
pub tags: Vec<Tag>,
// Topics taken from the first sense that created this node.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub topics: Vec<Tag>,
// Examples; only set on the last gloss of a sense's gloss path.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub examples: Vec<Example>,
// Nested sub-glosses.
#[serde(skip_serializing_if = "Map::is_empty")]
pub children: GlossTree,
}
/// Normalizes the tag list of every form entry: removes redundant tags,
/// merges person/case/verb-form tags, sorts the space-separated words inside
/// each tag, and finally orders the tags by similarity.
fn postprocess_forms(form_map: &mut FormMap) {
    for (_source, tags) in form_map.0.values_mut() {
        remove_redundant_tags(tags);
        merge_person_tags(tags);
        merge_case_tags(tags);
        merge_verb_form_tags(tags);

        // Each tag is a space-separated group of words; sort within the group.
        tags.iter_mut().for_each(|tag| {
            let mut parts: Vec<&str> = tag.split(' ').collect();
            sort_tags(&mut parts);
            *tag = parts.join(" ");
        });

        sort_tags_by_similar(tags);
    }
}
/// Processes one entry: records its forms and alt-of forms, then either
/// handles it as a gloss-less entry or inserts it as a lemma.
///
/// The lemma's reading falls back to the word itself when no reading is found.
pub(crate) fn process_main(edition: Edition, source: Lang, entry: &WordEntry, irs: &mut Tidy) {
    process_forms(edition, source, entry, irs);
    process_alt_forms(entry, irs);

    if entry.contains_no_gloss() {
        process_no_gloss(edition, entry, irs);
        return;
    }

    let reading = get_reading(edition, source, entry).unwrap_or_else(|| entry.word.clone());
    let info = process_entry(edition, source, entry);
    irs.insert_lemma(&entry.word, &reading, &entry.pos, info);
}
/// Returns `true` for entries whose part of speech is `"romanization"`.
pub(crate) fn should_skip_entry(entry: &WordEntry) -> bool {
    matches!(entry.pos.as_str(), "romanization")
}
/// Normalizes `entry` in place before it is processed and moves
/// inflection / alt-of senses out of the entry into the form map of `irs`.
///
/// Steps, in order:
/// 1. Edition-specific tag propagation: for `En`, tags of the canonical form
///    (except `"canonical"`) are copied to every sense; for `El`, gender tags
///    of a form equal to the word are copied to every sense.
/// 2. For `It`, the "missing definition" placeholder gloss is removed.
/// 3. For German verbs in the `De` edition, forms are cleaned up.
/// 4. Every sense is classified: inflection senses and alt-of senses are
///    turned into form entries and dropped from the entry; all other senses
///    are kept. With `opts.experimental`, an inflection sense is only treated
///    as such when the entry has no non-trivial forms of its own.
/// 5. With `opts.experimental`, glosses that do not end in punctuation get a
///    terminating period.
pub(crate) fn preprocess_main(
    edition: Edition,
    source: Lang,
    opts: &Options,
    entry: &mut WordEntry,
    irs: &mut Tidy,
) {
    match edition {
        Edition::En => {
            if let Some(cform) = entry.canonical_form() {
                // Clone so the borrow of `entry` ends before senses are mutated.
                let cform_tags: Vec<_> = cform.tags.clone();
                for sense in &mut entry.senses {
                    for tag in &cform_tags {
                        if tag != "canonical" && !sense.tags.contains(tag) {
                            sense.tags.push(tag.into());
                        }
                    }
                }
            }
        }
        Edition::El => {
            let gender_tags = ["masculine", "feminine", "neuter"];
            for form in &entry.forms {
                if form.form == entry.word {
                    for sense in &mut entry.senses {
                        for tag in &form.tags {
                            if gender_tags.contains(&tag.as_str()) && !sense.tags.contains(tag) {
                                sense.tags.push(tag.into());
                            }
                        }
                    }
                }
            }
        }
        _ => (),
    }

    if edition == Edition::It {
        for sense in &mut entry.senses {
            sense
                .glosses
                .retain(|gloss| *gloss != "definizione mancante; se vuoi, aggiungila tu");
        }
    }

    if edition == Edition::De && source == Lang::De && entry.pos == "verb" {
        preprocess_forms_de(entry);
    }

    // Take the senses out so `entry` can be borrowed immutably by the handlers.
    let old_senses = std::mem::take(&mut entry.senses);
    let mut senses_without_inflections = Vec::new();
    for sense in old_senses {
        if is_inflection_sense(edition, &sense)
            && (!opts.experimental || entry.non_trivial_forms().next().is_none())
        {
            handle_inflection_sense(edition, source, entry, &sense, irs);
        } else if !sense.alt_of.is_empty() {
            handle_alt_of_sense(entry, &sense, irs);
        } else {
            senses_without_inflections.push(sense);
        }
    }
    entry.senses = senses_without_inflections;

    if opts.experimental {
        static TRAILING_PUNCT_RE: LazyLock<Regex> =
            LazyLock::new(|| Regex::new(r"\p{P}$").unwrap());
        for sense in &mut entry.senses {
            for gloss in &mut sense.glosses {
                // Terminate glosses lacking final punctuation with a period
                // (appending a space here would only add invisible trailing
                // whitespace and leave the gloss unterminated).
                if !TRAILING_PUNCT_RE.is_match(gloss) {
                    gloss.push('.');
                }
            }
        }
    }
}
/// Cleans up the form list of a German verb entry:
/// strips a leading personal pronoun, drops compound-tense forms and forms
/// ending in `…`/`!` or containing a comma, then strips a leading `"zu "` and,
/// for forms tagged `extended`, a trailing `" zu haben"`.
fn preprocess_forms_de(entry: &mut WordEntry) {
    const PRONOUNS: &[&str] = &["ich ", "du ", "er/sie/es ", "wir ", "ihr ", "sie "];
    const COMPOUND_TENSE_TAGS: [&str; 6] = [
        "perfect",
        "pluperfect",
        "future-i",
        "future-ii",
        "processual-passive",
        "statal-passive",
    ];

    // Only the first matching pronoun prefix is removed.
    for form in &mut entry.forms {
        let without_pronoun = PRONOUNS
            .iter()
            .find_map(|&prefix| form.form.strip_prefix(prefix));
        if let Some(rest) = without_pronoun {
            form.form = rest.to_string();
        }
    }

    entry.forms.retain(|form| {
        let has_compound_tag = form
            .tags
            .iter()
            .any(|tag| COMPOUND_TENSE_TAGS.contains(&tag.as_str()));
        let surface = form.form.as_str();
        !(has_compound_tag || surface.ends_with(['…', '!']) || surface.contains(','))
    });

    for form in &mut entry.forms {
        if let Some(rest) = form.form.strip_prefix("zu ") {
            form.form = rest.to_string();
        }
        let is_extended = form.tags.iter().any(|tag| tag == "extended");
        if is_extended && let Some(rest) = form.form.strip_suffix(" zu haben") {
            form.form = rest.to_string();
        }
    }
}
/// Registers the entry's non-trivial forms in the form map.
///
/// Iteration stops at the first form for which `should_break_at_finish_forms`
/// is true; forms rejected by `should_skip_form` are ignored. The remaining
/// tags of a form (minus `REDUNDANT_FORM_TAGS`) are joined with spaces into a
/// single tag string.
fn process_forms(edition: Edition, source: Lang, entry: &WordEntry, irs: &mut Tidy) {
    for form in entry.non_trivial_forms() {
        debug_assert_ne!(form.form, entry.word);

        if should_break_at_finish_forms(edition, source, form) {
            break;
        }
        if should_skip_form(edition, source, &entry.pos, form) {
            continue;
        }

        let kept_tags: Vec<&str> = form
            .tags
            .iter()
            .map(String::as_str)
            .filter(|tag| !REDUNDANT_FORM_TAGS.contains(tag))
            .collect();
        let joined = kept_tags.join(" ");

        irs.insert_form(
            &entry.word,
            &form.form,
            &entry.pos,
            FormSource::Extracted,
            vec![joined],
        );
    }
}
/// Decides, per edition/source-language pair, whether a form should be
/// ignored. Pairs without a dedicated rule never skip.
fn should_skip_form(edition: Edition, source: Lang, pos: &str, form: &Form) -> bool {
    let text = form.form.as_str();
    // True when the form carries at least one of the given tags.
    let has_any_tag =
        |wanted: &[&str]| form.tags.iter().any(|tag| wanted.contains(&tag.as_str()));

    match (edition, source) {
        (Edition::Fr, Lang::Fr) => {
            const SKIPPED_PREFIXES: [&str; 5] = ["qu’", "que ", "il/elle/on", "ils/elles", "en "];
            SKIPPED_PREFIXES
                .iter()
                .any(|&prefix| text.starts_with(prefix))
                || has_any_tag(&["pluperfect"])
        }
        (Edition::En, Lang::Ja) => {
            is_japanese_romanization(text) || (text.contains('[') && text.contains(']'))
        }
        (Edition::En, Lang::En) => {
            has_any_tag(&["rare", "nonstandard", "dialectal"])
                || (text.contains(' ') && has_any_tag(&["comparative", "superlative"]))
        }
        (Edition::En, Lang::Fi) => has_any_tag(&["rare", "possessive"]) || text.contains(' '),
        (Edition::Ja, Lang::Ja) => {
            pos == "noun" && !has_any_tag(&["transliteration", "kanji"])
        }
        _ => false,
    }
}
/// Returns `true` when every character of `form` is ASCII or one of the
/// lowercase macron vowels `ā ī ū ē ō` (vacuously true for an empty string).
fn is_japanese_romanization(form: &str) -> bool {
    const MACRON_VOWELS: [char; 5] = ['ā', 'ī', 'ū', 'ē', 'ō'];
    !form
        .chars()
        .any(|c| !c.is_ascii() && !MACRON_VOWELS.contains(&c))
}
/// For Finnish entries of the English edition, returns `true` when `form` is
/// one of the two marker sentences after which no further forms are read.
fn should_break_at_finish_forms(edition: Edition, source: Lang, form: &Form) -> bool {
    const END_MARKERS: [&str; 2] = [
        "See the possessive forms below.",
        "Rare. Only used with substantive adjectives.",
    ];
    matches!((edition, source), (Edition::En, Lang::Fi))
        && END_MARKERS.contains(&form.form.as_str())
}
/// Registers one `alt-of` form entry per word in the entry-level `alt_of`
/// list, with the entry's word as the uninflected side and the alt-of word as
/// the inflected side.
// NOTE(review): the inflection handlers pass the referenced word first and
// `entry.word` second; this function uses the opposite order — confirm that
// the direction is intended.
fn process_alt_forms(entry: &WordEntry, irs: &mut Tidy) {
    entry.alt_of.iter().for_each(|alt_form| {
        irs.insert_form(
            &entry.word,
            &alt_form.word,
            &entry.pos,
            FormSource::AltOfTop,
            vec![String::from("alt-of")],
        );
    });
}
/// Handles entries without glosses. Only the `El` edition does anything:
/// a participle entry with at least one `form_of` is registered as an
/// inflection of the first `form_of` word.
#[expect(clippy::single_match)]
fn process_no_gloss(edition: Edition, entry: &WordEntry, irs: &mut Tidy) {
    match edition {
        Edition::El => {
            if !entry.is_participle() {
                return;
            }
            let Some(form_of) = entry.form_of.first() else {
                return;
            };
            let redirect_tag = format!("redirected from {}", entry.word);
            irs.insert_form(
                &form_of.word,
                &entry.word,
                &entry.pos,
                FormSource::Inflection,
                vec![redirect_tag],
            );
        }
        _ => (),
    }
}
/// Looks up the reading of an entry, using a strategy chosen by the
/// edition/source-language pair; all other pairs fall back to the canonical
/// word.
pub fn get_reading(edition: Edition, source: Lang, entry: &WordEntry) -> Option<String> {
    // The explicit arms are mutually exclusive, so their order is irrelevant.
    match (edition, source) {
        (Edition::Ja, _) => entry
            .transliteration_form()
            .map(|form| form.form.clone()),
        (Edition::En | Edition::Zh, Lang::Zh) => entry.pinyin().map(String::from),
        (Edition::En, Lang::Ja) => get_japanese_reading(entry),
        (Edition::En, Lang::Fa) => entry.romanization_form().map(|form| form.form.clone()),
        _ => get_canonical_word(source, entry),
    }
}
/// Returns the entry's canonical form for Latin, Russian, Ancient Greek,
/// Arabic and Persian sources; `None` for every other language or when the
/// entry has no canonical form.
fn get_canonical_word(source: Lang, entry: &WordEntry) -> Option<String> {
    let uses_canonical_form = matches!(
        source,
        Lang::La | Lang::Ru | Lang::Grc | Lang::Ar | Lang::Fa
    );
    if !uses_canonical_form {
        return None;
    }
    entry.canonical_form().map(|form| form.form.to_string())
}
/// Builds a reading by replacing, left to right, each ruby base found in the
/// entry's word with its ruby reading.
///
/// Returns `None` when there is no canonical form, when it has no ruby, or
/// (with a warning) when a ruby base cannot be found after the previous
/// replacement.
fn get_japanese_reading(entry: &WordEntry) -> Option<String> {
    let cform = entry.canonical_form()?;
    if cform.ruby.is_empty() {
        return None;
    }

    let mut reading_text = entry.word.clone();
    // Byte offset just past the last inserted reading; search resumes here.
    let mut cursor = 0;
    for (base, reading) in &cform.ruby {
        let Some(offset) = reading_text[cursor..].find(base) else {
            tracing::warn!("Kanji '{}' not found in '{}'", base, reading_text);
            return None;
        };
        let start = cursor + offset;
        reading_text.replace_range(start..start + base.len(), reading);
        cursor = start + reading.len();
    }
    Some(reading_text)
}
/// Builds the `LemmaInfo` for an entry: gloss tree, tags, up to
/// `MAX_NUMBER_OF_SYNONYMS` synonyms different from the word itself,
/// newline-joined etymology and head-info texts, and the two links.
///
/// For the `Ru` edition, etymology texts containing the placeholder
/// `"Происходит от ??"` are discarded.
fn process_entry(edition: Edition, source: Lang, entry: &WordEntry) -> LemmaInfo {
    let gloss_tree = get_gloss_tree(entry);
    let tags = entry.tags.clone();

    let synonyms = entry
        .synonyms
        .iter()
        .filter(|syn| syn.word != entry.word)
        .take(MAX_NUMBER_OF_SYNONYMS)
        .cloned()
        .collect();

    let etymology_text = entry
        .etymology_texts()
        .filter(|texts| edition != Edition::Ru || !texts.contains(&"Происходит от ??"))
        .map(|texts| texts.join("\n"));

    let head_info_text = get_head_info(&entry.head_templates).map(|lines| lines.join("\n"));

    LemmaInfo {
        gloss_tree,
        tags,
        synonyms,
        etymology_text,
        head_info_text,
        link_wiktionary: link_wiktionary(edition, source, &entry.word),
        link_kaikki: link_kaikki(edition, source, &entry.word),
    }
}
/// Matches a parenthesized group with at least one character inside
/// (non-greedy), e.g. `(plural)`.
static PARENS_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\(.+?\)").unwrap());
/// Collects the distinct head-template expansions that contain a
/// parenthesized group, in first-seen order. Returns `None` if there are none.
fn get_head_info(head_templates: &[HeadTemplate]) -> Option<Vec<&str>> {
    let mut seen = Set::default();
    let mut unique = Vec::new();
    for template in head_templates {
        let expansion = template.expansion.as_str();
        // `insert` reports whether the expansion is new, which dedups in place.
        if PARENS_RE.is_match(expansion) && seen.insert(expansion) {
            unique.push(expansion);
        }
    }
    (!unique.is_empty()).then_some(unique)
}
/// Builds the gloss tree of an entry from its senses.
///
/// Per sense, at most `MAX_NUMBER_OF_EXAMPLES` non-empty examples of at most
/// `MAX_SIZE_OF_EXAMPLE` characters are kept, over-long references are
/// blanked, and examples with a translation are moved to the front.
fn get_gloss_tree(entry: &WordEntry) -> GlossTree {
    let mut tree = GlossTree::default();
    for sense in &entry.senses {
        let mut examples: Vec<_> = sense
            .examples
            .iter()
            .filter(|ex| !ex.text.is_empty() && ex.text.chars().count() <= MAX_SIZE_OF_EXAMPLE)
            .take(MAX_NUMBER_OF_EXAMPLES)
            .map(|ex| {
                let mut kept = ex.clone();
                if kept.reference.chars().count() > MAX_SIZE_OF_EXAMPLE_REFERENCE {
                    kept.reference = String::new();
                }
                kept
            })
            .collect();
        // Stable sort: `false` (has a translation) orders before `true`.
        examples.sort_by_key(|ex| ex.translation.is_empty());

        insert_glosses(
            &mut tree,
            &sense.glosses,
            &sense.tags,
            &sense.topics,
            &examples,
        );
    }
    tree
}
/// Inserts a path of glosses into the tree, one level per gloss.
///
/// A new node starts with the given tags and topics. For every node on the
/// path whose tag list is non-empty, the tags are narrowed to those of `tags`
/// that the node already has (in the order of `tags`). The examples are
/// stored on the node of the last gloss, replacing any previous examples.
fn insert_glosses(
    gloss_tree: &mut GlossTree,
    glosses: &[String],
    tags: &[Tag],
    topics: &[Tag],
    examples: &[Example],
) {
    let Some((head, tail)) = glosses.split_first() else {
        return;
    };

    let node = gloss_tree.entry(head.clone()).or_insert_with(|| GlossInfo {
        tags: tags.to_vec(),
        topics: topics.to_vec(),
        ..Default::default()
    });

    if !node.tags.is_empty() {
        let shared: Vec<Tag> = tags
            .iter()
            .filter(|&tag| node.tags.contains(tag))
            .cloned()
            .collect();
        node.tags = shared;
    }

    if tail.is_empty() {
        node.examples = examples.to_vec();
    } else {
        insert_glosses(&mut node.children, tail, tags, topics, examples);
    }
}
/// Matches German inflection glosses of the shape
/// `<tags>des <word class> <lemma>`; group 1 captures everything before
/// `des`, group 2 everything after the word class.
static DE_INFLECTION_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"^(.*)des (?:Verbs|Adjektivs|Substantivs|Demonstrativpronomens|Possessivpronomens|Pronomens) (.*)$"
).unwrap()
});
/// Decides whether a sense merely describes an inflection of another word.
///
/// * `De`: some gloss matches `DE_INFLECTION_RE`.
/// * `El`: the sense has a `form_of` and some gloss contains `"του"`.
/// * `En`: some gloss contains `"inflection of"`, ends with `of <word>`, or
///   contains `of <word> (` and ends with `)`, for a non-empty `form_of` word.
/// * `Fr`, `It`, `Ja`: the sense has exactly one `form_of`.
/// * Any other edition: never.
fn is_inflection_sense(edition: Edition, sense: &Sense) -> bool {
    match edition {
        Edition::De => sense
            .glosses
            .iter()
            .any(|gloss| DE_INFLECTION_RE.is_match(gloss)),
        Edition::El => {
            !sense.form_of.is_empty() && sense.glosses.iter().any(|gloss| gloss.contains("του"))
        }
        Edition::En => sense.glosses.iter().any(|gloss| {
            gloss.contains("inflection of")
                || sense
                    .form_of
                    .iter()
                    .filter(|form| !form.word.is_empty())
                    .any(|form| {
                        let of_word = format!("of {}", form.word);
                        gloss.ends_with(&of_word)
                            || (gloss.contains(&format!("{of_word} (")) && gloss.ends_with(')'))
                    })
        }),
        Edition::Fr | Edition::It | Edition::Ja => sense.form_of.len() == 1,
        _ => false,
    }
}
/// Sense tags (gender, number, case) that are kept as inflection tags for
/// Greek-edition inflection senses; every other tag is dropped.
const TAGS_RETAINED_EL: [&str; 9] = [
"masculine",
"feminine",
"neuter",
"singular",
"plural",
"nominative",
"accusative",
"genitive",
"vocative",
];
/// Turns an inflection sense into form entries that map the entry's word to
/// the word it is an inflection of.
///
/// # Panics
/// Panics for editions other than `De`, `El`, `En`, `Fr`, `It`, `Ja`, and for
/// `Fr`/`It`/`Ja` senses that do not have exactly one `form_of`.
fn handle_inflection_sense(
    edition: Edition,
    source: Lang,
    entry: &WordEntry,
    sense: &Sense,
    irs: &mut Tidy,
) {
    debug_assert!(!sense.glosses.is_empty());

    // When no usable tag survives filtering, record where the redirect came from.
    let or_redirect_tag = |tags: Vec<String>| -> Vec<String> {
        if tags.is_empty() {
            vec![format!("redirected from {}", entry.word)]
        } else {
            tags
        }
    };

    match edition {
        Edition::De => {
            let Some(caps) = DE_INFLECTION_RE.captures(&sense.glosses[0]) else {
                return;
            };
            let (Some(tag_text), Some(lemma)) = (caps.get(1), caps.get(2)) else {
                return;
            };
            irs.insert_form(
                lemma.as_str(),
                &entry.word,
                &entry.pos,
                FormSource::Inflection,
                vec![tag_text.as_str().trim().to_string()],
            );
        }
        Edition::El => {
            let retained: Vec<String> = sense
                .tags
                .iter()
                .filter(|tag| TAGS_RETAINED_EL.contains(&tag.as_str()))
                .map(String::from)
                .collect();
            let inflection_tags = or_redirect_tag(retained);
            for form in &sense.form_of {
                irs.insert_form(
                    &form.word,
                    &entry.word,
                    &entry.pos,
                    FormSource::Inflection,
                    inflection_tags.clone(),
                );
            }
        }
        Edition::En => handle_inflection_sense_en(source, entry, sense, irs),
        Edition::Fr | Edition::It | Edition::Ja => {
            let [form_of] = sense.form_of.as_slice() else {
                unreachable!()
            };
            debug_assert!(
                sense.tags.iter().any(|tag| *tag == "form-of")
                    || entry.tags.iter().any(|tag| *tag == "form-of")
            );
            let without_form_of: Vec<String> = sense
                .tags
                .iter()
                .filter(|tag| *tag != "form-of")
                .map(String::from)
                .collect();
            let lemma = normalize_orthography(source, &form_of.word);
            irs.insert_form(
                &lemma,
                &entry.word,
                &entry.pos,
                FormSource::Inflection,
                or_redirect_tag(without_form_of),
            );
        }
        _ => unreachable!("Unhandled lang that implements is_inflection_sense"),
    }
}
fn handle_inflection_sense_en(source: Lang, entry: &WordEntry, sense: &Sense, irs: &mut Tidy) {
let uninflected = match sense.form_of.as_slice() {
[alt_form] => &alt_form.word,
_ => return,
};
let inflected = get_canonical_word(source, entry).unwrap_or_else(|| entry.word.clone());
if inflected == *uninflected {
return;
}
let mut inflections = Set::default();
let of_uninflected = format!("of {uninflected}");
for gloss in &sense.glosses {
let cleaned = gloss
.replace("inflection of ", "")
.replace(&of_uninflected, "")
.replace(uninflected, "")
.replace(':', "");
let inflection = PARENS_RE.replace_all(&cleaned, "").trim().to_string();
if !inflection.is_empty() {
inflections.insert(inflection);
}
}
for inflection in inflections {
irs.insert_form(
uninflected,
&inflected,
&entry.pos,
FormSource::Inflection,
vec![inflection],
);
}
}
/// Registers one form entry per word in the sense's `alt_of` list, tagged
/// with the sense's tags plus `"alt-of"` (added only if missing).
///
/// Nothing is recorded when the sense carries any of the tags in
/// `SKIPPED_TAGS` (misspellings, obsolete spellings, abbreviations, ...).
// NOTE(review): as in `process_alt_forms`, `entry.word` is passed as the
// uninflected side and the alt-of word as the inflected side — confirm that
// this direction is intended.
fn handle_alt_of_sense(entry: &WordEntry, sense: &Sense, irs: &mut Tidy) {
    const SKIPPED_TAGS: [&str; 6] = [
        "misspelling",
        "misconstruction",
        "nonstandard",
        "pronunciation-spelling",
        "obsolete",
        "abbreviation",
    ];

    // Both the skip decision and the tag list depend only on the sense, so
    // they are computed once instead of once per alt form.
    if sense
        .tags
        .iter()
        .any(|tag| SKIPPED_TAGS.contains(&tag.as_str()))
    {
        return;
    }

    let mut sense_tags = sense.tags.clone();
    if !sense_tags.iter().any(|tag| tag == "alt-of") {
        sense_tags.push("alt-of".to_string());
    }

    for alt_form in &sense.alt_of {
        irs.insert_form(
            &entry.word,
            &alt_form.word,
            &entry.pos,
            FormSource::AltOfSense,
            sense_tags.clone(),
        );
    }
}
/// Strips language-specific diacritics from `word`.
///
/// * Arabic, Persian: removes the code points listed in `ARABIC_DIACRITICS`.
/// * Latin, Old English, Old Irish, Ancient Greek, Romanian, Indonesian:
///   removes all combining marks in U+0300..=U+036F (via NFD, then NFC).
/// * Tagalog: as above, and additionally removes `-` and `'`.
/// * Serbo-Croatian: removes combining marks only when they follow one of
///   `a e i o u r` (either case); marks on other letters are kept.
/// * Ukrainian, Russian: removes U+0301 (combining acute accent).
/// * Any other language: returns the word unchanged.
pub(crate) fn normalize_orthography(source: Lang, word: &str) -> String {
    const ARABIC_DIACRITICS: [char; 16] = [
        '\u{0618}', '\u{0619}', '\u{061A}', '\u{064B}', '\u{064C}', '\u{064D}', '\u{064E}',
        '\u{064F}', '\u{0650}', '\u{0651}', '\u{0652}', '\u{0653}', '\u{0654}', '\u{0655}',
        '\u{0656}', '\u{0670}',
    ];
    let is_combining_mark = |c: char| ('\u{0300}'..='\u{036F}').contains(&c);

    match source {
        Lang::Ar | Lang::Fa => word
            .chars()
            .filter(|c| !ARABIC_DIACRITICS.contains(c))
            .collect(),
        Lang::La | Lang::Ang | Lang::Sga | Lang::Grc | Lang::Ro | Lang::Id => word
            .nfd()
            .filter(|&c| !is_combining_mark(c))
            .nfc()
            .collect(),
        Lang::Tl => word
            .nfd()
            .filter(|&c| !(is_combining_mark(c) || c == '-' || c == '\''))
            .nfc()
            .collect(),
        Lang::Sh => {
            // Most recent non-combining character seen in the NFD stream.
            let mut prev_base = None;
            word.nfd()
                .filter(|&c| {
                    if !is_combining_mark(c) {
                        prev_base = Some(c);
                        return true;
                    }
                    !matches!(
                        prev_base,
                        Some(
                            'a' | 'e' | 'i' | 'o' | 'u' | 'r' | 'A' | 'E' | 'I' | 'O' | 'U' | 'R'
                        )
                    )
                })
                .nfc()
                .collect()
        }
        Lang::Uk | Lang::Ru => word.chars().filter(|&c| c != '\u{0301}').collect(),
        _ => word.to_owned(),
    }
}