mod flags;
mod meta;
mod parse;
mod rule;
mod rules_apply;
mod rules_reverse;
use std::collections::BTreeMap;
use std::fmt;
use std::sync::Arc;
use hashbrown::{HashMap, HashSet};
use stringmetrics::try_levenshtein;
use unicode_segmentation::UnicodeSegmentation;
use xxhash_rust::xxh32::xxh32;
pub use self::flags::{Flag, FlagValue};
use self::meta::{Meta, PersonalMeta, Source};
pub use self::parse::DictEntry;
use self::parse::PersonalEntry;
pub use self::rule::AfxRule;
use self::rules_apply::{create_affixed_word_map, word_splitter};
use crate::affix::{CompiledFlags, FlagType};
use crate::error::{BuildError, Error};
use crate::helpers::StrWrapper;
use crate::morph::MorphInfo;
use crate::ParsedCfg;
/// A compiled dictionary: the main entry point for checking words.
///
/// Built via [`DictBuilder`]. Holds three separate wordlists (regular,
/// nosuggest, forbidden) plus interned stem/morph storage shared by all
/// entries.
#[must_use]
#[derive(Clone, Debug, PartialEq)]
pub struct Dictionary {
    /// Words considered correct and eligible as suggestions.
    wordlist: WordList,
    /// Words considered correct but never offered as suggestions.
    wordlist_nosuggest: WordList,
    /// Words explicitly marked incorrect.
    wordlist_forbidden: WordList,
    /// Interned stems; deduplicated and shared via `Arc`.
    stems: HashSet<Arc<str>>,
    /// Affix flags compiled from the configuration, keyed by flag.
    affix_flags: BTreeMap<Flag, FlagValue>,
    /// Interned morphological info, shared via `Arc`.
    morphs: HashSet<Arc<MorphInfo>>,
    /// How flags are encoded in the source dictionary file.
    flag_type: FlagType,
    /// The parsed affix configuration (boxed to keep this struct small).
    parsed_config: Box<ParsedCfg>,
}
impl Dictionary {
    /// Create an empty dictionary from a parsed affix configuration.
    ///
    /// # Errors
    /// Fails if the configuration's flags cannot be compiled.
    #[inline]
    fn new(cfg: ParsedCfg) -> Result<Self, Error> {
        let CompiledFlags {
            affix_flags,
            rule_flags: _,
        } = cfg.compile_flags()?;
        Ok(Self {
            wordlist: WordList::new(),
            wordlist_nosuggest: WordList::new(),
            wordlist_forbidden: WordList::new(),
            stems: HashSet::new(),
            morphs: HashSet::new(),
            affix_flags,
            flag_type: cfg.flag_type(),
            parsed_config: Box::new(cfg),
        })
    }

    /// Check that every word in `input` is spelled correctly.
    #[inline]
    pub fn check(&self, input: &str) -> bool {
        input.unicode_words().all(|w| self.check_word(w))
    }

    /// Check whether a single word is spelled correctly.
    ///
    /// Delegates to the same lookup used by [`Self::entry`] so the two can
    /// never disagree. (Previously this method consulted the forbidden list
    /// only for the exact-case form, while `entry` also checked the
    /// lowercase form, so case-variant forbidden words gave inconsistent
    /// answers between `check_word` and `entry(..).correct()`.)
    #[inline]
    pub fn check_word(&self, word: &str) -> bool {
        self.locate_word_inner(word, 0).correct()
    }

    /// Iterate over misspelled words as `(byte_index, word)` pairs.
    #[inline]
    pub fn check_indices<'a: 'd, 'd>(
        &'d self,
        input: &'a str,
    ) -> impl Iterator<Item = (usize, &'a str)> + 'd {
        word_splitter(input).filter(|(_idx, w)| !self.check_word(w))
    }

    /// Look up `word` in all wordlists and build its entry context.
    ///
    /// Lookup order: the forbidden list (exact or lowercase form) wins
    /// outright; otherwise the regular wordlist is preferred over
    /// nosuggest, and within each list the exact form over the lowercase
    /// form.
    fn locate_word_inner<'d, 's>(&'d self, word: &'s str, index: usize) -> WordEntry<'d, 's> {
        let lower = word.to_lowercase();
        let ctx = if self.wordlist_forbidden.0.contains_key(word)
            || self.wordlist_forbidden.0.contains_key(lower.as_str())
        {
            WordCtx::Incorrect { forbidden: true }
        } else if let Some((matched, meta_list)) = [&self.wordlist, &self.wordlist_nosuggest]
            .into_iter()
            .find_map(|wl| {
                wl.0.get_key_value(word)
                    .or_else(|| wl.0.get_key_value(lower.as_str()))
            })
        {
            WordCtx::Correct {
                matched: matched.as_ref(),
                meta_list: meta_list.as_slice(),
            }
        } else {
            WordCtx::Incorrect { forbidden: false }
        };
        WordEntry {
            word,
            index,
            dict: self,
            context: ctx,
        }
    }

    /// Iterate a [`WordEntry`] for every word in `input`.
    #[inline]
    pub fn entries<'d, 's>(&'d self, input: &'s str) -> impl Iterator<Item = WordEntry<'d, 's>> {
        word_splitter(input).map(|(idx, word)| self.locate_word_inner(word, idx))
    }

    /// Look up a single word, returning its full entry (index is always 0).
    #[inline]
    pub fn entry<'d, 's>(&'d self, word: &'s str) -> WordEntry<'d, 's> {
        self.locate_word_inner(word, 0)
    }

    /// Access the regular wordlist (hidden; for inspection/testing).
    #[inline]
    #[doc(hidden)]
    pub fn wordlist(&self) -> &WordList {
        &self.wordlist
    }

    /// Access the nosuggest wordlist (hidden; for inspection/testing).
    #[inline]
    #[doc(hidden)]
    pub fn wordlist_nosuggest(&self) -> &WordList {
        &self.wordlist_nosuggest
    }

    /// Access the forbidden wordlist (hidden; for inspection/testing).
    #[inline]
    #[doc(hidden)]
    pub fn wordlist_forbidden(&self) -> &WordList {
        &self.wordlist_forbidden
    }
}
impl Dictionary {
fn create_affixed_words(&mut self, stem: &str, flags: &[Flag], morph: &[Arc<MorphInfo>]) {
let mut prefix_rules = Vec::new();
let mut suffix_rules = Vec::new();
let stem: &Arc<str> = self
.stems
.get_or_insert_with(&StrWrapper::new(stem), |s: &StrWrapper| Arc::from(s.0));
let mut add_stem = true;
let mut forbid = false;
let mut nosuggest = false;
for flag in flags {
if !self.affix_flags.contains_key(flag) {
continue;
}
match self.affix_flags.get(flag).unwrap() {
FlagValue::ForbiddenWord => forbid = true,
FlagValue::NoSuggest => nosuggest = true,
FlagValue::Rule(rule) => {
if rule.is_pfx() {
prefix_rules.push(rule);
} else {
suffix_rules.push(rule);
}
}
FlagValue::AfxNeeded => add_stem = false,
_ => {
}
}
}
let dest = if forbid {
&mut self.wordlist_forbidden
} else if nosuggest {
&mut self.wordlist_nosuggest
} else {
&mut self.wordlist
};
let dict_meta = if add_stem {
#[cfg(not(box_from_slice_has_clone_bound))]
let morph = morph.to_owned(); let meta = Meta::new(stem.clone(), Source::Dict(morph.into()));
let meta_vec = dest.0.entry_ref(stem.as_ref()).or_insert_with(Vec::new);
meta_vec.push(Meta::clone(&meta));
Some(meta)
} else {
None
};
create_affixed_word_map(stem, &prefix_rules, &suffix_rules, dict_meta.as_ref(), dest);
prefix_rules.clear();
suffix_rules.clear();
}
fn parse_update_wordlist(&mut self, source: &str) -> Result<(), Error> {
let entries = DictEntry::parse_all(source, self.flag_type)?;
self.update_wordlist(&entries);
Ok(())
}
fn update_wordlist(&mut self, entries: &[DictEntry]) {
self.wordlist.0.reserve(entries.len() * 5);
for entry in entries {
let DictEntry { stem, flags, morph } = entry;
self.create_affixed_words(stem, flags, morph);
}
}
#[allow(clippy::unnecessary_wraps)] fn parse_update_personal(&mut self, source: &str, dict: &[DictEntry]) -> Result<(), Error> {
let entries = PersonalEntry::parse_all(source);
self.update_personal(entries, dict);
Ok(())
}
fn update_personal(&mut self, entries: Vec<PersonalEntry>, _dict: &[DictEntry]) {
self.wordlist.0.reserve(entries.len() * 2);
for entry in entries {
if let Some(_friend) = &entry.friend {
} else {
let stem_arc: Arc<str> = self.stems.get_or_insert(entry.stem).clone();
let meta = PersonalMeta::new(None, self.get_or_insert_morphs(&entry.morph));
let source = Source::Personal(Arc::new(meta));
let meta = Meta::new(Arc::clone(&stem_arc), source);
let hmap = if entry.forbid {
&mut self.wordlist_forbidden.0
} else {
&mut self.wordlist.0
};
let extra_vec: &mut Vec<Meta> = hmap
.entry_ref(stem_arc.as_ref())
.or_insert_with(|| Vec::with_capacity(1));
extra_vec.push(meta);
}
}
}
fn get_or_insert_morphs(&mut self, morphs: &[MorphInfo]) -> Vec<Arc<MorphInfo>> {
let mut ret: Vec<Arc<MorphInfo>> = Vec::with_capacity(morphs.len());
for morph in morphs {
ret.push(
self.morphs
.get_or_insert_with(morph, |m| Arc::new(m.clone()))
.clone(),
);
}
ret
}
fn shrink_storage(&mut self) {
self.wordlist.0.shrink_to_fit();
self.wordlist_nosuggest.0.shrink_to_fit();
self.wordlist_forbidden.0.shrink_to_fit();
self.stems.shrink_to_fit();
self.morphs.shrink_to_fit();
}
}
/// The result of looking up one word: the word itself, where it sat in the
/// input, and the outcome of the dictionary lookup.
#[derive(Clone)]
pub struct WordEntry<'dict, 'word> {
    /// The word exactly as supplied by the caller.
    word: &'word str,
    /// Byte offset of the word within the original input.
    index: usize,
    /// Back-reference to the dictionary that produced this entry.
    dict: &'dict Dictionary,
    /// Outcome of the lookup (correct with metadata, or incorrect).
    context: WordCtx<'dict>,
}
impl<'dict, 'word> fmt::Debug for WordEntry<'dict, 'word> {
    /// Formats the entry, deliberately omitting the `dict` back-reference
    /// to keep the output readable.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut dbg = f.debug_struct("WordEntry");
        dbg.field("word", &self.word);
        dbg.field("index", &self.index);
        dbg.field("context", &self.context);
        dbg.finish()
    }
}
/// Internal lookup outcome for a [`WordEntry`].
#[derive(Clone, Debug)]
enum WordCtx<'dict> {
    /// The word matched an accepting wordlist (regular or nosuggest).
    Correct {
        /// The exact wordlist key that matched (may be the lowercase form
        /// of the input word).
        matched: &'dict str,
        /// Metadata for every dictionary entry that produced this word.
        meta_list: &'dict [Meta],
    },
    /// The word did not match; `forbidden` is set when it was explicitly
    /// forbidden rather than simply unknown.
    Incorrect {
        forbidden: bool,
    },
}
impl<'dict, 'word> WordEntry<'dict, 'word> {
    /// `true` if the word was found in an accepting wordlist and is not
    /// forbidden.
    #[inline]
    pub fn correct(&self) -> bool {
        matches!(self.context, WordCtx::Correct { .. })
    }

    /// The word exactly as the caller supplied it.
    #[inline]
    pub fn word(&self) -> &str {
        self.word
    }

    /// Byte offset of this word in the original input (0 for single-word
    /// lookups via `Dictionary::entry`).
    #[inline]
    pub fn index(&self) -> usize {
        self.index
    }

    /// The dictionary this entry was produced from.
    #[inline]
    pub fn dict(&self) -> &Dictionary {
        self.dict
    }

    /// The wordlist key that matched, if the word was correct. May differ
    /// from `word` in case, since the lowercase form is also tried.
    #[inline]
    pub fn matched_entry(&self) -> Option<&str> {
        match self.context {
            WordCtx::Correct { matched, .. } => Some(matched),
            WordCtx::Incorrect { .. } => None,
        }
    }

    /// `true` if the word was explicitly forbidden, as opposed to merely
    /// unknown.
    #[inline]
    pub fn forbidden(&self) -> bool {
        matches!(self.context, WordCtx::Incorrect { forbidden: true })
    }

    /// Iterate candidate stems for a correct word; `None` when incorrect.
    ///
    /// Yields each meta entry's own stem followed by any `MorphInfo::Stem`
    /// values from its morphological data, with duplicates filtered out.
    #[inline]
    pub fn stems(&self) -> Option<impl Iterator<Item = &str>> {
        // Seen-set of xxHash32 digests rather than the strings themselves.
        // NOTE(review): a 32-bit hash collision would silently drop a
        // distinct stem, and `Vec::contains` is linear per item —
        // presumably acceptable because stem lists are tiny; confirm.
        let mut visited: Vec<u32> = Vec::new();
        let WordCtx::Correct {
            matched: _,
            meta_list,
        } = self.context
        else {
            return None;
        };
        let ret = meta_list.iter().flat_map(|meta| {
            // The meta's own stem first, then stems recorded in its
            // morphological info.
            let stem = std::iter::once(meta.stem());
            let morph_stems = meta.source().morphs().filter_map(|morph| match morph {
                MorphInfo::Stem(v) => Some(v.as_ref()),
                _ => None,
            });
            stem.chain(morph_stems)
        });
        // Lazily skip anything already yielded.
        let ret = ret.filter(move |value| {
            let hash = xxh32(value.as_bytes(), 0);
            if visited.contains(&hash) {
                false
            } else {
                visited.push(hash);
                true
            }
        });
        Some(ret)
    }

    /// Iterate all morphological info attached to a correct word; `None`
    /// when incorrect.
    #[inline]
    pub fn analyze(&self) -> Option<impl Iterator<Item = &MorphInfo>> {
        let WordCtx::Correct { meta_list, .. } = self.context else {
            return None;
        };
        let ret = meta_list.iter().flat_map(|meta| meta.source().morphs());
        Some(ret)
    }

    /// Suggest up to 10 dictionary words close to an incorrect word;
    /// `None` when the word is already correct.
    #[inline]
    #[cfg(feature = "unstable-suggestions")]
    pub fn suggest(&self) -> Option<Vec<&str>> {
        if self.correct() {
            return None;
        };
        // Keep only keys within the Levenshtein distance limit, then sort
        // by distance and take the closest matches. Scans the entire
        // regular wordlist (nosuggest is intentionally excluded).
        let mut suggestions: Vec<(u32, &str)> = self
            .dict
            .wordlist
            .0
            .keys()
            .filter_map(|key| try_levenshtein(key, self.word, 1).map(|lim| (lim, key.as_ref())))
            .collect();
        suggestions.sort_unstable_by_key(|(k, _v)| *k);
        Some(suggestions.iter().take(10).map(|(_k, v)| *v).collect())
    }
}
/// A single wordlist: maps each word to the metadata of every dictionary
/// entry that produced it.
#[doc(hidden)]
#[derive(Clone, Debug, PartialEq)]
pub struct WordList(HashMap<Box<str>, Vec<Meta>>);
impl WordList {
fn new() -> Self {
Self(HashMap::new())
}
#[inline]
#[cfg_attr(feature = "zspell-unstable", visibility::make(pub))]
pub(crate) fn inner(&self) -> &HashMap<Box<str>, Vec<Meta>> {
&self.0
}
}
/// Builder for a [`Dictionary`]: supply config, dictionary, and personal
/// wordlist sources, then call `build`.
#[must_use]
#[derive(Clone, Debug, PartialEq)]
pub struct DictBuilder<'a> {
    /// An already-parsed config (mutually exclusive with `cfg_src`).
    cfg: Option<ParsedCfg>,
    /// Raw affix configuration source text.
    cfg_src: Option<&'a str>,
    /// Raw dictionary (wordlist) source text.
    dict_src: Option<&'a str>,
    /// Raw personal wordlist source text.
    personal_src: Option<&'a str>,
}
impl<'a> DictBuilder<'a> {
#[inline]
pub fn new() -> Self {
Self {
cfg: None,
cfg_src: None,
dict_src: None,
personal_src: None,
}
}
#[inline]
pub fn config_str(mut self, config: &'a str) -> Self {
self.cfg_src = Some(config);
self
}
#[inline]
#[cfg_attr(feature = "zspell-unstable", visibility::make(pub))]
fn config(mut self, cfg: ParsedCfg) -> Self {
self.cfg = Some(cfg);
self
}
#[inline]
pub fn dict_str(mut self, dict: &'a str) -> Self {
self.dict_src = Some(dict);
self
}
#[inline]
pub fn personal_str(mut self, personal: &'a str) -> Self {
self.personal_src = Some(personal);
self
}
#[inline]
pub fn build(self) -> Result<Dictionary, Error> {
if self.cfg.is_some() && self.cfg_src.is_some() {
return Err(Error::Build(BuildError::BuilderCfgSpecTwice));
}
let cfg = if let Some(c) = self.cfg {
c
} else if let Some(cs) = self.cfg_src {
ParsedCfg::load_from_str(cs)?
} else {
return Err(Error::Build(BuildError::BuilderCfgUnspecified));
};
let mut dict = Dictionary::new(cfg)?;
if let Some(wl) = self.dict_src {
dict.parse_update_wordlist(wl)?;
}
if let Some(wl) = self.personal_src {
dict.parse_update_personal(wl, &[])?;
}
dict.shrink_storage();
Ok(dict)
}
}
impl<'a> Default for DictBuilder<'a> {
#[inline]
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests;