use std::{fs::File, io::BufWriter, path::PathBuf, sync::LazyLock};
use anyhow::Result;
use indexmap::map::Entry;
use regex::Regex;
use serde::{Deserialize, Serialize};
use unicode_normalization::UnicodeNormalization;
mod heap;
use heap::HeapSize;
mod preprocess_forms;
use preprocess_forms::preprocess_forms;
mod postprocess;
pub use postprocess::postprocess_main;
use crate::{
Map, Set,
cli::{LangSpecs, Options},
dict::Intermediate,
lang::{Edition, Lang},
models::kaikki::{Example, Form, HeadTemplate, Sense, Synonym, Tag, WordEntry},
path::PathManager,
tags::{Pos, REDUNDANT_FORM_TAGS},
utils::{human_size, link_kaikki, link_wiktionary},
};
/// Maximum number of synonyms kept per lemma entry (applied in `process_entry`).
const MAX_NUMBER_OF_SYNONYMS: usize = 3;
/// Maximum number of examples kept per sense (applied in `get_gloss_tree`).
const MAX_NUMBER_OF_EXAMPLES: usize = 3;
/// Examples whose text is longer than this many characters are dropped.
const MAX_SIZE_OF_EXAMPLE: usize = 120;
/// Example references longer than this many characters are replaced by an empty string.
const MAX_SIZE_OF_EXAMPLE_REFERENCE: usize = 120;
/// Intermediate representation built from word entries: a map of lemmas with
/// their gloss information and a map of forms pointing back at a lemma.
#[derive(Debug, Default)]
pub struct Tidy {
pub lemma_map: LemmaMap, pub form_map: FormMap, }
impl Intermediate for Tidy {
fn len(&self) -> usize {
self.len()
}
fn write(&self, pm: &PathManager) -> Result<PathBuf> {
self.write(pm)
}
}
impl Tidy {
    /// Total number of stored items: lemma infos plus form entries.
    fn len(&self) -> usize {
        self.lemma_map.len() + self.form_map.len()
    }

    /// Records `entry` under the `(lemma, reading, pos)` key.
    fn insert_lemma(&mut self, lemma: &str, reading: &str, pos: &str, entry: LemmaInfo) {
        self.lemma_map.insert(lemma, reading, pos, entry);
    }

    /// Records that `inflected` is a form of `uninflected`, with the given tags.
    ///
    /// See [`FormMap::insert`] for the cases that are silently ignored.
    fn insert_form(
        &mut self,
        uninflected: &str,
        inflected: &str,
        pos: &str,
        source: FormSource,
        tags: Vec<Tag>,
    ) {
        self.form_map
            .insert(uninflected, inflected, pos, source, tags);
    }

    /// Writes the non-empty maps as JSON into the tidy directory and returns
    /// that directory.
    ///
    /// # Errors
    /// Fails if a file cannot be created, serialized into, or flushed.
    #[tracing::instrument(skip_all)]
    fn write(&self, pm: &PathManager) -> Result<PathBuf> {
        let dir_tidy = pm.dir_tidy();
        // Best effort: if the directory is missing, `File::create` below reports it.
        _ = std::fs::create_dir_all(&dir_tidy);
        if !self.lemma_map.0.is_empty() {
            self.write_json(pm.path_lemmas(), &self.lemma_map, pm.opts.pretty)?;
        }
        if !self.form_map.0.is_empty() {
            self.write_json(pm.path_forms(), &self.form_map, pm.opts.pretty)?;
        }
        Ok(dir_tidy)
    }

    /// Serializes `data` as JSON (pretty-printed when `pretty` is set) to `path`.
    ///
    /// # Errors
    /// Fails on file creation, serialization, or flush errors.
    fn write_json<T: Serialize>(&self, path: PathBuf, data: &T, pretty: bool) -> Result<()> {
        let mut writer = BufWriter::new(File::create(&path)?);
        if pretty {
            serde_json::to_writer_pretty(&mut writer, data)?;
        } else {
            serde_json::to_writer(&mut writer, data)?;
        }
        // Dropping a `BufWriter` flushes it but discards any error; flush
        // explicitly so a failed final write is reported to the caller.
        std::io::Write::flush(&mut writer)?;
        Ok(())
    }
}
/// Logs, at debug level, how many lemmas and forms `irs` contains.
///
/// When the combined heap estimate exceeds 500 MiB a multi-line report with
/// per-map sizes is emitted; otherwise a single summary line is logged.
pub fn found_ir_message_impl(langs: LangSpecs, irs: &Tidy) {
    const MB: f64 = 1024.0 * 1024.0;

    let (lemmas, forms) = (&irs.lemma_map, &irs.form_map);

    // Counts.
    let n_lemmas = lemmas.len();
    let n_forms = forms.len();
    let n_irs = n_lemmas + n_forms;
    let n_forms_inflection = forms.len_inflection();
    let n_forms_extracted = forms.len_extracted();
    let n_forms_alt_of = forms.len_alt_of();
    let n_forms_postprocessed = forms.len_postprocessed();
    debug_assert_eq!(
        n_forms,
        n_forms_inflection + n_forms_extracted + n_forms_alt_of + n_forms_postprocessed,
        "mismatch in form counts"
    );

    // Heap estimates and their printable versions.
    let lemma_heap = lemmas.heap_size() as f64;
    let form_heap = forms.heap_size() as f64;
    let irs_heap = lemma_heap + form_heap;
    let [lemma_heap_msg, form_heap_msg, irs_heap_msg] =
        [lemma_heap, form_heap, irs_heap].map(|bytes| human_size(bytes));

    let is_large = irs_heap > 500.0 * MB;
    if !is_large {
        tracing::debug!(
            "Found {n_irs} irs: {n_lemmas} terms, {n_forms} forms \
             [{n_forms_inflection} infl, {n_forms_extracted} extr, {n_forms_alt_of} alt]"
        );
        return;
    }

    tracing::debug!(
        "[{}-{}] Found {} irs ({})",
        langs.source,
        langs.target,
        n_irs,
        irs_heap_msg,
    );
    tracing::debug!("├─ terms: {} ({})", n_lemmas, lemma_heap_msg,);
    tracing::debug!(
        "└─ forms: {} ({}) [infl {}, extr {}, alt {}]",
        n_forms,
        form_heap_msg,
        n_forms_inflection,
        n_forms_extracted,
        n_forms_alt_of,
    );
}
/// Key of [`LemmaMap`]: a lemma, its reading and its part of speech.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
struct LemmaKey {
/// The lemma itself.
lemma: String,
/// Reading of the lemma; `process_main` passes an empty string when there is none
/// or when it equals the lemma.
reading: String,
/// Part of speech.
pos: Pos,
}
impl LemmaKey {
    /// Builds a key, copying the borrowed lemma and reading into owned strings.
    fn new(lemma: &str, reading: &str, pos: Pos) -> Self {
        let (lemma, reading) = (lemma.to_owned(), reading.to_owned());
        Self {
            lemma,
            reading,
            pos,
        }
    }

    /// Borrows the key as a `(lemma, reading, pos)` tuple.
    fn unpack(&self) -> (&str, &str, Pos) {
        let Self {
            lemma,
            reading,
            pos,
        } = self;
        (lemma.as_str(), reading.as_str(), *pos)
    }
}
/// Lemma infos grouped by `(lemma, reading, pos)`; several infos may share a key.
#[derive(Debug, Default)]
pub struct LemmaMap(Map<LemmaKey, Vec<LemmaInfo>>);
impl LemmaMap {
    /// Appends `entry` to the list stored under `(lemma, reading, pos)`,
    /// creating the list on first use.
    fn insert(&mut self, lemma: &str, reading: &str, pos: &str, entry: LemmaInfo) {
        debug_assert!(!entry.gloss_tree.is_empty());
        let key = LemmaKey::new(lemma, reading, Pos::from(pos));
        self.0.entry(key).or_default().push(entry);
    }
}
/// Serializes the flat map as nested maps: lemma → reading → pos → infos.
impl Serialize for LemmaMap {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let mut by_lemma: Map<&str, Map<&str, Map<Pos, &Vec<LemmaInfo>>>> = Map::default();
        for (key, infos) in &self.0 {
            let (lemma, reading, pos) = key.unpack();
            let by_reading = by_lemma.entry(lemma).or_default();
            let by_pos = by_reading.entry(reading).or_default();
            by_pos.insert(pos, infos);
        }
        by_lemma.serialize(serializer)
    }
}
impl LemmaMap {
    /// Iterates over every stored info together with its `(lemma, reading, pos)` key.
    pub fn flat_iter(&self) -> impl Iterator<Item = (&str, &str, Pos, &LemmaInfo)> {
        self.0.iter().flat_map(|(key, infos)| {
            // The key parts are the same for all infos in this bucket.
            let (lemma, reading, pos) = key.unpack();
            infos.iter().map(move |info| (lemma, reading, pos, info))
        })
    }

    /// Number of stored infos (not the number of distinct keys).
    fn len(&self) -> usize {
        self.0.values().fold(0, |total, infos| total + infos.len())
    }
}
/// Key of [`FormMap`]: an uninflected word, one of its forms and the part of speech.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct FormKey {
/// The word the form points back to.
uninflected: String,
/// The form itself.
inflected: String,
/// Part of speech.
pos: Pos,
}
impl FormKey {
    /// Builds a key, copying both borrowed words into owned strings.
    fn new(uninflected: &str, inflected: &str, pos: Pos) -> Self {
        let (uninflected, inflected) = (uninflected.to_owned(), inflected.to_owned());
        Self {
            uninflected,
            inflected,
            pos,
        }
    }

    /// Borrows the key as an `(uninflected, inflected, pos)` tuple.
    const fn unpack(&self) -> (&str, &str, Pos) {
        let Self {
            uninflected,
            inflected,
            pos,
        } = self;
        (uninflected.as_str(), inflected.as_str(), *pos)
    }
}
/// Forms keyed by `(uninflected, inflected, pos)`, each with the source that
/// first produced it and the accumulated tags.
#[derive(Debug, Default)]
pub struct FormMap(Map<FormKey, (FormSource, Vec<String>)>);
impl FormMap {
    /// Records `inflected` as a form of `uninflected`.
    ///
    /// Nothing is stored when `tags` is empty, when either word is empty, or
    /// when both words are equal. If the key already exists, `tags` are
    /// appended and the originally stored source is kept.
    fn insert(
        &mut self,
        uninflected: &str,
        inflected: &str,
        pos: &str,
        source: FormSource,
        tags: Vec<Tag>,
    ) {
        let has_blank_word = uninflected.is_empty() || inflected.is_empty();
        if tags.is_empty() || has_blank_word || uninflected == inflected {
            return;
        }

        match self
            .0
            .entry(FormKey::new(uninflected, inflected, Pos::from(pos)))
        {
            Entry::Occupied(mut slot) => slot.get_mut().1.extend(tags),
            Entry::Vacant(slot) => {
                slot.insert((source, tags));
            }
        }
    }
}
/// Serializes the flat map as nested maps: uninflected → inflected → pos → (source, tags).
impl Serialize for FormMap {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        #[expect(clippy::type_complexity)]
        let mut by_uninflected: Map<&str, Map<&str, Map<Pos, &(FormSource, Vec<String>)>>> =
            Map::default();
        for (key, value) in &self.0 {
            let (uninflected, inflected, pos) = key.unpack();
            let by_inflected = by_uninflected.entry(uninflected).or_default();
            let by_pos = by_inflected.entry(inflected).or_default();
            by_pos.insert(pos, value);
        }
        by_uninflected.serialize(serializer)
    }
}
impl FormMap {
    /// Iterates over all forms as `(uninflected, inflected, pos, source, tags)`.
    pub fn flat_iter(&self) -> impl Iterator<Item = (&str, &str, Pos, &FormSource, &Vec<String>)> {
        self.0.iter().map(|(key, value)| {
            let (uninflected, inflected, pos) = key.unpack();
            (uninflected, inflected, pos, &value.0, &value.1)
        })
    }

    /// Like [`Self::flat_iter`], but with mutable access to source and tags.
    pub fn flat_iter_mut(
        &mut self,
    ) -> impl Iterator<Item = (&str, &str, Pos, &mut FormSource, &mut Vec<String>)> {
        self.0.iter_mut().map(|(key, (form_source, form_tags))| {
            let (uninflected, inflected, pos) = key.unpack();
            (uninflected, inflected, pos, form_source, form_tags)
        })
    }

    /// Number of stored forms (one per key).
    fn len(&self) -> usize {
        self.0.len()
    }

    /// Number of forms whose stored source equals `source`.
    fn len_of(&self, source: FormSource) -> usize {
        self.0
            .values()
            .filter(|(form_source, _)| *form_source == source)
            .count()
    }

    /// Number of forms with source [`FormSource::Extracted`].
    fn len_extracted(&self) -> usize {
        self.len_of(FormSource::Extracted)
    }

    /// Number of forms with source [`FormSource::Inflection`].
    fn len_inflection(&self) -> usize {
        self.len_of(FormSource::Inflection)
    }

    /// Number of forms with either alt-of source (top-level or sense-level).
    fn len_alt_of(&self) -> usize {
        [FormSource::AltOfTop, FormSource::AltOfSense]
            .into_iter()
            .map(|source| self.len_of(source))
            .sum()
    }

    /// Number of forms with source [`FormSource::PostProcessed`].
    fn len_postprocessed(&self) -> usize {
        self.len_of(FormSource::PostProcessed)
    }
}
/// Where a form entry came from. Serialized in lowercase (`rename_all = "lowercase"`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum FormSource {
/// Taken from the entry's own list of forms (`process_forms`).
Extracted,
/// Derived from an inflection or redirect sense, or from a gloss-less entry
/// (`handle_inflection_sense`, `handle_see_sense`, `process_no_gloss`).
Inflection,
/// Taken from the entry-level `alt_of` list (`process_alt_forms`).
AltOfTop,
/// Taken from a sense-level `alt_of` list (`handle_alt_of_sense`).
AltOfSense,
/// Not produced in this file; presumably set by the `postprocess` module — TODO confirm.
PostProcessed,
}
/// Everything stored for one lemma entry; built by `process_entry`.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct LemmaInfo {
/// Glosses of the entry arranged as a tree (see `get_gloss_tree`).
pub gloss_tree: GlossTree,
/// Entry-level tags; omitted from the output when empty.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub tags: Vec<Tag>,
/// Synonyms other than the word itself, capped at `MAX_NUMBER_OF_SYNONYMS`;
/// omitted from the output when empty.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub synonyms: Vec<Synonym>,
/// Etymology texts joined with newlines; omitted from the output when absent.
#[serde(skip_serializing_if = "Option::is_none")]
pub etymology_text: Option<String>,
/// Head template expansions joined with newlines (see `get_head_info`);
/// omitted from the output when absent.
#[serde(skip_serializing_if = "Option::is_none")]
pub head_info_text: Option<String>,
/// Link produced by `link_wiktionary`; serialized as `wlink`.
#[serde(rename = "wlink")]
pub link_wiktionary: String,
/// Link produced by `link_kaikki`; serialized as `klink`.
#[serde(rename = "klink")]
pub link_kaikki: String,
}
/// Maps a gloss text to its node; nested glosses live in `GlossInfo::children`.
pub type GlossTree = Map<String, GlossInfo>;
/// One node of a [`GlossTree`]. Empty or absent fields are skipped when
/// serializing, and missing fields take their default when deserializing.
#[derive(Debug, Serialize, Deserialize, Default, Clone)]
#[serde(default)]
pub struct GlossInfo {
/// Tags attached to this gloss.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub tags: Vec<Tag>,
/// Topics attached to this gloss.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub topics: Vec<Tag>,
/// Examples; `insert_glosses` only sets these on the last gloss of a sense.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub examples: Vec<Example>,
/// Sub-glosses of this gloss, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub children: Option<Box<GlossTree>>,
}
/// Adds one word entry to `irs`: its forms and alt forms first, then either
/// the gloss-less fallback or a full lemma entry.
pub fn process_main(edition: Edition, source: Lang, entry: &WordEntry, irs: &mut Tidy) {
    process_forms(edition, source, entry, irs);
    process_alt_forms(entry, irs);

    // A reading identical to the word carries no information; store it as empty.
    let reading = get_reading(edition, source, entry)
        .filter(|reading| *reading != entry.word)
        .unwrap_or_default();

    if entry.contains_no_gloss() {
        process_no_gloss(edition, entry, irs);
        return;
    }

    let info = process_entry(edition, source, entry);
    irs.insert_lemma(&entry.word, &reading, &entry.pos, info);
}
/// Returns `true` for entries that should be ignored entirely: those whose
/// part of speech is `romanization`.
pub fn should_skip_entry(entry: &WordEntry) -> bool {
    const SKIPPED_POS: &str = "romanization";
    entry.pos == SKIPPED_POS
}
/// Normalizes `entry` in place before it is processed.
///
/// Steps, in order: run `preprocess_forms`; copy selected form tags onto the
/// senses (English and Greek editions); drop the Italian placeholder gloss;
/// move senses that a handler recognizes (inflection, alt-of, "see") out of
/// `entry.senses`, recording them in `irs` as forms; and, when
/// `opts.experimental` is set, append a space to glosses that do not end in
/// punctuation.
pub fn preprocess_main(
edition: Edition,
source: Lang,
opts: &Options,
entry: &mut WordEntry,
irs: &mut Tidy,
) {
preprocess_forms(edition, source, entry);
match edition {
Edition::En => {
// Copy every tag of the canonical form, except "canonical" itself, onto each sense.
if let Some(cform) = entry.canonical_form() {
let cform_tags: Vec<_> = cform.tags.clone();
for sense in &mut entry.senses {
for tag in &cform_tags {
if tag != "canonical" && !sense.tags.contains(tag) {
sense.tags.push(tag.into());
}
}
}
}
}
Edition::El => {
// Copy gender tags from forms that are spelled like the word itself onto each sense.
let gender_tags = ["masculine", "feminine", "neuter"];
for form in &entry.forms {
if form.form == entry.word {
for sense in &mut entry.senses {
for tag in &form.tags {
if gender_tags.contains(&tag.as_str()) && !sense.tags.contains(tag) {
sense.tags.push(tag.into());
}
}
}
}
}
}
_ => (),
}
if edition == Edition::It {
// Placeholder gloss ("missing definition; add it yourself if you like") is not a real gloss.
for sense in &mut entry.senses {
sense
.glosses
.retain(|gloss| *gloss != "definizione mancante; se vuoi, aggiungila tu");
}
}
// Senses are moved out of the entry; only those no handler consumes are put back.
let old_senses = std::mem::take(&mut entry.senses);
let mut senses_without_inflections = Vec::new();
for sense in old_senses {
// Handlers are tried in order and the first one returning `true` consumes the sense.
// The inflection handler is only tried when experimental mode is off or the
// entry has no non-trivial forms.
if (!opts.experimental || entry.non_trivial_forms().next().is_none())
&& handle_inflection_sense(edition, source, entry, &sense, irs)
{
} else if handle_alt_of_sense(entry, &sense, irs) {
} else if handle_see_sense(edition, entry, &sense, irs) {
} else {
senses_without_inflections.push(sense);
}
}
entry.senses = senses_without_inflections;
if opts.experimental {
// Matches a gloss whose last character is Unicode punctuation.
static TRAILING_PUNCT_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\p{P}$").unwrap());
for sense in &mut entry.senses {
for gloss in &mut sense.glosses {
// Glosses without trailing punctuation get a trailing space appended.
if !TRAILING_PUNCT_RE.is_match(gloss) {
gloss.push(' ');
}
}
}
}
}
/// Records the entry's non-trivial forms as [`FormSource::Extracted`] forms.
///
/// Iteration stops at the first form that `should_break_at_finish_forms`
/// flags; forms flagged by `should_skip_form` are ignored. The surviving
/// tags of each form are joined with spaces into a single tag string.
fn process_forms(edition: Edition, source: Lang, entry: &WordEntry, irs: &mut Tidy) {
    for form in entry.non_trivial_forms() {
        debug_assert_ne!(form.form, entry.word);

        if should_break_at_finish_forms(edition, source, form) {
            break;
        }
        if should_skip_form(edition, source, entry, form) {
            continue;
        }

        let mut kept_tags: Vec<&str> = Vec::new();
        for tag in &form.tags {
            let tag = tag.as_str();
            if !REDUNDANT_FORM_TAGS.contains(&tag) {
                kept_tags.push(tag);
            }
        }
        let tag_line = kept_tags.join(" ");

        irs.insert_form(
            &entry.word,
            &form.form,
            &entry.pos,
            FormSource::Extracted,
            vec![tag_line],
        );
    }
}
/// Decides, per edition/source-language pair, whether `form` should be left
/// out of the form map. Pairs without a rule never skip.
fn should_skip_form(edition: Edition, source: Lang, entry: &WordEntry, form: &Form) -> bool {
    let text = form.form.as_str();
    // True when the form carries at least one of the given tags.
    let has_any_tag =
        |wanted: &[&str]| form.tags.iter().any(|tag| wanted.contains(&tag.as_str()));

    match (edition, source) {
        // Forms given with a leading particle.
        (Edition::Fr, Lang::Fr) => ["qu’", "que ", "en "]
            .iter()
            .any(|prefix| text.starts_with(prefix)),
        // Romanizations and forms containing a bracketed part.
        (Edition::En, Lang::Ja) => {
            is_japanese_romanization(text) || (text.contains('[') && text.contains(']'))
        }
        // Marginal forms and multi-word comparatives/superlatives.
        (Edition::En, Lang::En) => {
            has_any_tag(&["rare", "nonstandard", "dialectal"])
                || (text.contains(' ') && has_any_tag(&["comparative", "superlative"]))
        }
        // Rare or possessive forms, and anything containing a space.
        (Edition::En, Lang::Fi) => has_any_tag(&["rare", "possessive"]) || text.contains(' '),
        // Forms that merely extend the headword.
        (Edition::Ja, Lang::Ja) => text.starts_with(&entry.word),
        _ => false,
    }
}
/// Returns `true` when `form` consists only of ASCII characters and
/// macron-marked vowels (which includes the empty string).
fn is_japanese_romanization(form: &str) -> bool {
    const MACRON_VOWELS: [char; 5] = ['ā', 'ī', 'ū', 'ē', 'ō'];
    // Equivalent to "every character is allowed": no character is disallowed.
    !form
        .chars()
        .any(|c| !c.is_ascii() && !MACRON_VOWELS.contains(&c))
}
/// Returns `true` when `form` is one of the marker texts after which the
/// remaining forms of a Finnish entry (English edition) are ignored.
fn should_break_at_finish_forms(edition: Edition, source: Lang, form: &Form) -> bool {
    const END_MARKERS: [&str; 2] = [
        "See the possessive forms below.",
        "Rare. Only used with substantive adjectives.",
    ];

    matches!((edition, source), (Edition::En, Lang::Fi))
        && END_MARKERS.iter().any(|marker| form.form == *marker)
}
/// Records each entry-level `alt_of` word as an alternative form of the
/// entry's word, tagged `alt-of`.
fn process_alt_forms(entry: &WordEntry, irs: &mut Tidy) {
    for alt_word in entry.alt_of.iter().map(|alt| &alt.word) {
        let tags = vec![String::from("alt-of")];
        irs.insert_form(
            &entry.word,
            alt_word,
            &entry.pos,
            FormSource::AltOfTop,
            tags,
        );
    }
}
/// Handles entries without any gloss. Only the Greek edition has a rule: a
/// participle is recorded as a redirect to its first `form_of` word.
#[expect(clippy::single_match)]
fn process_no_gloss(edition: Edition, entry: &WordEntry, irs: &mut Tidy) {
    match edition {
        Edition::El => {
            if !entry.is_participle() {
                return;
            }
            let Some(lemma) = entry.form_of.first() else {
                return;
            };
            let redirect_tag = format!("redirected from {}", entry.word);
            irs.insert_form(
                &lemma.word,
                &entry.word,
                &entry.pos,
                FormSource::Inflection,
                vec![redirect_tag],
            );
        }
        _ => (),
    }
}
/// Picks the reading of `entry` for the given edition and source language,
/// falling back to `get_canonical_word` when no specific rule applies.
pub fn get_reading(edition: Edition, source: Lang, entry: &WordEntry) -> Option<String> {
    match edition {
        // The Japanese edition always uses the transliteration form.
        Edition::Ja => entry.transliteration_form().map(|f| f.form.clone()),
        Edition::En => match source {
            Lang::Ja => get_japanese_reading(entry),
            Lang::Fa => entry.romanization_form().map(|f| f.form.clone()),
            Lang::Zh => entry.pinyin().map(String::from),
            _ => get_canonical_word(source, entry),
        },
        Edition::Zh if matches!(source, Lang::Zh) => entry.pinyin().map(String::from),
        _ => get_canonical_word(source, entry),
    }
}
/// Returns the text of the entry's canonical form, but only for Latin,
/// Russian, Ancient Greek, Arabic and Persian; `None` otherwise.
fn get_canonical_word(source: Lang, entry: &WordEntry) -> Option<String> {
    let uses_canonical_form = matches!(
        source,
        Lang::La | Lang::Ru | Lang::Grc | Lang::Ar | Lang::Fa
    );
    if !uses_canonical_form {
        return None;
    }
    let canonical = entry.canonical_form()?;
    Some(canonical.form.clone())
}
/// Builds a reading by replacing, left to right, each ruby base found in the
/// word with its ruby reading.
///
/// Returns `None` when there is no canonical form, when it has no ruby, or
/// when a base cannot be found after the previous replacement (a warning is
/// logged in that last case).
fn get_japanese_reading(entry: &WordEntry) -> Option<String> {
    let cform = entry.canonical_form()?;
    if cform.ruby.is_empty() {
        return None;
    }

    let mut reading_text = entry.word.clone();
    // Byte offset just past the last inserted reading; later bases are searched from here.
    let mut search_from = 0;
    for (base, reading) in &cform.ruby {
        let Some(offset) = reading_text[search_from..].find(base) else {
            tracing::warn!("Kanji '{}' not found in '{}'", base, reading_text);
            return None;
        };
        let start = search_from + offset;
        reading_text.replace_range(start..start + base.len(), reading);
        search_from = start + reading.len();
    }
    Some(reading_text)
}
fn process_entry(edition: Edition, source: Lang, entry: &WordEntry) -> LemmaInfo {
LemmaInfo {
gloss_tree: get_gloss_tree(entry),
tags: entry.tags.clone(),
synonyms: entry
.synonyms
.iter()
.filter(|syn| syn.word != entry.word)
.take(MAX_NUMBER_OF_SYNONYMS)
.cloned()
.collect(),
etymology_text: entry
.etymology_texts()
.filter(|texts| match edition {
Edition::Ru => !texts.contains(&"Происходит от ??"),
_ => true,
})
.map(|etymology_text| etymology_text.join("\n")),
head_info_text: get_head_info(&entry.head_templates)
.map(|head_info_text| head_info_text.join("\n")),
link_wiktionary: link_wiktionary(edition, source, &entry.word),
link_kaikki: link_kaikki(edition, source, &entry.word),
}
}
/// Matches a parenthesized group with at least one character inside, as short as possible.
static PARENS_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\(.+?\)").unwrap());
/// Collects the distinct head-template expansions that contain a
/// parenthesized part, in first-seen order; `None` when there are none.
fn get_head_info(head_templates: &[HeadTemplate]) -> Option<Vec<&str>> {
    let mut seen = Set::default();
    let mut unique = Vec::new();
    for template in head_templates {
        let expansion = template.expansion.as_str();
        // `insert` returns false for an expansion that was already collected.
        if PARENS_RE.is_match(expansion) && seen.insert(expansion) {
            unique.push(expansion);
        }
    }
    (!unique.is_empty()).then_some(unique)
}
fn get_gloss_tree(entry: &WordEntry) -> GlossTree {
let mut gloss_tree = GlossTree::default();
for sense in &entry.senses {
let mut filtered_examples: Vec<_> = sense
.examples
.iter()
.filter(|ex| !ex.text.is_empty() && ex.text.chars().count() <= MAX_SIZE_OF_EXAMPLE)
.take(MAX_NUMBER_OF_EXAMPLES)
.cloned()
.map(|mut ex| {
if ex.reference.chars().count() > MAX_SIZE_OF_EXAMPLE_REFERENCE {
ex.reference = String::new();
}
ex
})
.collect();
filtered_examples.sort_by_key(|ex| ex.translation.is_empty());
insert_glosses(
&mut gloss_tree,
&sense.glosses,
&sense.tags,
&sense.topics,
&filtered_examples,
);
}
gloss_tree
}
fn insert_glosses(
gloss_tree: &mut GlossTree,
glosses: &[String],
tags: &[Tag],
topics: &[Tag],
examples: &[Example],
) {
let Some(head) = glosses.first() else {
return;
};
let tail = &glosses[1..];
let node = gloss_tree.entry(head.clone()).or_insert_with(|| GlossInfo {
tags: tags.to_vec(),
topics: topics.to_vec(),
examples: Vec::new(),
children: None,
});
if !node.tags.is_empty() {
node.tags = tags
.iter()
.filter(|&t| node.tags.contains(t))
.cloned()
.collect();
}
if tail.is_empty() {
node.examples = examples.to_vec();
return;
}
let children = node
.children
.get_or_insert_with(|| Box::new(GlossTree::default()));
insert_glosses(children, tail, tags, topics, examples);
}
/// Matches German glosses of the shape `<text> des <word class> <word>`; group 1 is
/// the text before "des" and group 2 the text after the word class.
static DE_INFLECTION_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"^(.*)des (?:Verbs|Adjektivs|Substantivs|Demonstrativpronomens|Possessivpronomens|Pronomens) (.*)$"
).unwrap()
});
/// Sense tags that `handle_inflection_sense` keeps for Greek-edition inflection senses.
const TAGS_RETAINED_EL: [&str; 9] = [
"masculine",
"feminine",
"neuter",
"singular",
"plural",
"nominative",
"accusative",
"genitive",
"vocative",
];
fn handle_inflection_sense(
edition: Edition,
source: Lang,
entry: &WordEntry,
sense: &Sense,
irs: &mut Tidy,
) -> bool {
if sense.glosses.is_empty() {
return false;
}
match edition {
Edition::De => {
let Some((inflection_tags, uninflected)) = sense
.glosses
.first()
.and_then(|gloss| DE_INFLECTION_RE.captures(gloss))
.and_then(|caps| Some((caps.get(1)?, caps.get(2)?)))
else {
return false;
};
let inflection_tags = inflection_tags.as_str().trim();
irs.insert_form(
uninflected.as_str(),
&entry.word,
&entry.pos,
FormSource::Inflection,
vec![inflection_tags.to_string()],
);
true
}
Edition::El => {
if sense.form_of.is_empty() || !sense.glosses.iter().any(|g| g.contains("του")) {
return false;
}
let allowed_tags: Vec<_> = sense
.tags
.iter()
.filter(|tag| TAGS_RETAINED_EL.contains(&tag.as_str()))
.map(String::from)
.collect();
let inflection_tags: Vec<_> = if allowed_tags.is_empty() {
vec![format!("redirected from {}", entry.word)]
} else {
allowed_tags
};
for form in &sense.form_of {
irs.insert_form(
&form.word,
&entry.word,
&entry.pos,
FormSource::Inflection,
inflection_tags.clone(),
);
}
true
}
Edition::En => {
let is_inflection = sense.glosses.iter().any(|gloss| {
if gloss.contains("inflection of") {
return true;
}
for form in &sense.form_of {
if form.word.is_empty() {
continue;
}
let subs = format!("of {}", form.word);
if gloss.ends_with(&subs)
|| (gloss.contains(&format!("{subs} (")) && gloss.ends_with(')'))
{
return true;
}
}
false
});
if !is_inflection {
return false;
}
handle_inflection_sense_en(source, entry, sense, irs);
true
}
_ => {
if sense.form_of.len() != 1 {
return false;
}
let form_of = &sense.form_of[0];
debug_assert!(
sense.tags.iter().any(|tag| *tag == "form-of")
|| entry.tags.iter().any(|tag| *tag == "form-of")
);
let allowed_tags: Vec<_> = sense
.tags
.iter()
.filter(|tag| *tag != "form-of")
.map(String::from)
.collect();
let inflection_tags: Vec<_> = if allowed_tags.is_empty() {
vec![format!("redirected from {}", entry.word)]
} else {
allowed_tags
};
let norm_form_of_word = normalize_orthography(source, &form_of.word);
irs.insert_form(
&norm_form_of_word,
&entry.word,
&entry.pos,
FormSource::Inflection,
inflection_tags,
);
true
}
}
}
/// English-edition helper: derives inflection descriptions from the glosses
/// of `sense` and records one form entry per distinct description.
///
/// Does nothing unless the sense has exactly one `form_of` target, or when
/// the inflected word equals that target.
fn handle_inflection_sense_en(source: Lang, entry: &WordEntry, sense: &Sense, irs: &mut Tidy) {
    let [target] = sense.form_of.as_slice() else {
        return;
    };
    let uninflected = &target.word;

    let inflected = get_canonical_word(source, entry).unwrap_or_else(|| entry.word.clone());
    if inflected == *uninflected {
        return;
    }

    let of_uninflected = format!("of {uninflected}");
    let mut inflections = Set::default();
    for gloss in &sense.glosses {
        // Strip the boilerplate and the lemma so only the grammatical description remains.
        let without_boilerplate = gloss
            .replace("inflection of ", "")
            .replace(&of_uninflected, "")
            .replace(uninflected, "")
            .replace(':', "");
        let description = PARENS_RE
            .replace_all(&without_boilerplate, "")
            .trim()
            .to_string();
        if description.is_empty() {
            continue;
        }
        inflections.insert(description);
    }

    for description in inflections {
        irs.insert_form(
            uninflected,
            &inflected,
            &entry.pos,
            FormSource::Inflection,
            vec![description],
        );
    }
}
/// Records the sense-level `alt_of` words of `sense` as alternative forms of
/// the entry's word.
///
/// Returns `true` whenever the sense has at least one `alt_of` target, even
/// if nothing is recorded because the sense carries one of the skipped tags
/// (misspellings, abbreviations, ...). Returns `false` when there is no target.
fn handle_alt_of_sense(entry: &WordEntry, sense: &Sense, irs: &mut Tidy) -> bool {
    /// Senses carrying any of these tags are consumed without recording a form.
    const SKIPPED_TAGS: [&str; 6] = [
        "misspelling",
        "misconstruction",
        "nonstandard",
        "pronunciation-spelling",
        "obsolete",
        "abbreviation",
    ];

    if sense.alt_of.is_empty() {
        return false;
    }
    // The skip decision depends only on the sense, so it is made once up front.
    if sense
        .tags
        .iter()
        .any(|tag| SKIPPED_TAGS.contains(&tag.as_str()))
    {
        return true;
    }

    // Built once and shared by all alt forms: the sense tags plus "alt-of".
    let mut sense_tags = sense.tags.clone();
    if !sense_tags.iter().any(|tag| tag == "alt-of") {
        sense_tags.push("alt-of".to_string());
    }

    for alt_form in &sense.alt_of {
        irs.insert_form(
            &entry.word,
            &alt_form.word,
            &entry.pos,
            FormSource::AltOfSense,
            sense_tags.clone(),
        );
    }
    true
}
/// Matches a whole gloss of the shape `X を参照` (brackets, the particle and the
/// final full stop are optional); group 1 is the kana/kanji word X.
static JA_SEE_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"^「?([\p{Hiragana}\p{Katakana}\p{Han}ー]+?)」?\s?を?参照。?$").unwrap()
});
/// Matches a whole gloss of the shape `X の漢字表記` (brackets and the final full
/// stop are optional); group 1 is the kana/kanji word X.
static JA_KANJI_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"^「?([\p{Hiragana}\p{Katakana}\p{Han}ー]+?)」?\s?の漢字表記。?$").unwrap()
});
/// Japanese edition only: when the first gloss of `sense` is a pure
/// cross-reference (matched by `JA_SEE_RE` or `JA_KANJI_RE`), records the
/// entry's word as a redirect to the referenced word.
///
/// Returns `true` when a cross-reference was recognized, `false` otherwise.
fn handle_see_sense(edition: Edition, entry: &WordEntry, sense: &Sense, irs: &mut Tidy) -> bool {
    if !matches!(edition, Edition::Ja) {
        return false;
    }
    let Some(gloss) = sense.glosses.first() else {
        return false;
    };

    let captures = JA_SEE_RE
        .captures(gloss)
        .or_else(|| JA_KANJI_RE.captures(gloss));
    let Some(target) = captures.as_ref().and_then(|caps| caps.get(1)) else {
        return false;
    };

    let redirect_tag = format!("redirected from {}", entry.word);
    irs.insert_form(
        target.as_str(),
        &entry.word,
        &entry.pos,
        FormSource::Inflection,
        vec![redirect_tag],
    );
    true
}
/// Strips language-specific diacritics from `word`.
///
/// Languages without a rule get the word back unchanged.
pub fn normalize_orthography(source: Lang, word: &str) -> String {
    const ARABIC_DIACRITICS: [char; 16] = [
        '\u{0618}', '\u{0619}', '\u{061A}', '\u{064B}', '\u{064C}', '\u{064D}', '\u{064E}',
        '\u{064F}', '\u{0650}', '\u{0651}', '\u{0652}', '\u{0653}', '\u{0654}', '\u{0655}',
        '\u{0656}', '\u{0670}',
    ];

    /// True for characters in U+0300..=U+036F (Combining Diacritical Marks).
    fn is_combining_mark(c: char) -> bool {
        matches!(c, '\u{0300}'..='\u{036F}')
    }

    match source {
        // Remove the listed Arabic-script marks.
        Lang::Ar | Lang::Fa => word
            .chars()
            .filter(|c| !ARABIC_DIACRITICS.contains(c))
            .collect(),
        // Decompose, drop all combining marks, recompose.
        Lang::La | Lang::Ang | Lang::Sga | Lang::Grc | Lang::Ro | Lang::Id => word
            .nfd()
            .filter(|&c| !is_combining_mark(c))
            .nfc()
            .collect(),
        // As above, and also drop hyphens and apostrophes.
        Lang::Tl => word
            .nfd()
            .filter(|&c| !is_combining_mark(c) && c != '-' && c != '\'')
            .nfc()
            .collect(),
        // Drop combining marks only when the preceding base letter is a vowel or `r`.
        Lang::Sh => {
            let mut previous_base: Option<char> = None;
            let kept = word.nfd().filter(|&c| {
                if !is_combining_mark(c) {
                    previous_base = Some(c);
                    return true;
                }
                !matches!(
                    previous_base,
                    Some('a' | 'e' | 'i' | 'o' | 'u' | 'r' | 'A' | 'E' | 'I' | 'O' | 'U' | 'R')
                )
            });
            kept.nfc().collect()
        }
        // Remove the combining acute accent (U+0301).
        Lang::Uk | Lang::Ru => word.chars().filter(|&c| c != '\u{0301}').collect(),
        _ => word.to_string(),
    }
}