#![no_std]
extern crate alloc;
pub(crate) mod aff;
pub(crate) mod checker;
mod hash_bag;
mod suggester;
mod umbra_slice;
pub use aff::parser::{
ParseDictionaryError, ParseDictionaryErrorKind, ParseDictionaryErrorSource, ParseFlagError,
};
pub use checker::Checker;
pub use suggester::Suggester;
use crate::alloc::{borrow::Cow, slice, string::String, vec::Vec};
use aff::AffData;
use core::{cmp::Ordering, fmt, hash::BuildHasher};
use hash_bag::HashBag;
/// Hasher builder used when `Dictionary` is named without an explicit `S` parameter.
///
/// With the `default-hasher` feature enabled this is `foldhash::fast::RandomState`.
#[cfg(feature = "default-hasher")]
pub type DefaultHashBuilder = foldhash::fast::RandomState;
/// Stand-in for the default `S` parameter of `Dictionary` when the `default-hasher` feature is
/// disabled.
///
/// The enum has no variants, so no value of it can be constructed; it only lets the name
/// `DefaultHashBuilder` resolve in both feature configurations.
#[cfg(not(feature = "default-hasher"))]
pub enum DefaultHashBuilder {}
// Key type of the word table.
type Stem = umbra_slice::UmbraString;
// The word table: a `HashBag` keyed by `Stem`, storing a `FlagSet` per entry and hashing with `S`.
type WordList<S> = HashBag<Stem, FlagSet, S>;
/// A parsed dictionary: the word table together with the data read from the affix file.
///
/// `S` is the hasher builder used by the word table and defaults to `DefaultHashBuilder`.
/// Both fields are produced together by `aff::parser::parse` (see `new_with_hasher`).
#[derive(Clone)]
pub struct Dictionary<S = DefaultHashBuilder> {
// Word table mapping stems to flag sets. A single stem can yield more than one flag set:
// `remove_stem` iterates over every entry returned for a word.
words: WordList<S>,
// Affix data returned by the parser; `add` and `remove_stem` read the flag type, flag aliases,
// ignored characters and the forbidden-word flag from it.
aff_data: AffData,
}
#[cfg(feature = "default-hasher")]
impl Dictionary<DefaultHashBuilder> {
    /// Builds a dictionary from the contents of an affix (`aff`) file and a word-list (`dic`)
    /// file, hashing with a default-constructed [`DefaultHashBuilder`].
    ///
    /// This is a convenience wrapper around [`Dictionary::new_with_hasher`].
    ///
    /// # Errors
    ///
    /// Returns the [`ParseDictionaryError`] reported by [`Dictionary::new_with_hasher`].
    pub fn new(aff: &str, dic: &str) -> Result<Self, ParseDictionaryError> {
        let build_hasher = DefaultHashBuilder::default();
        Self::new_with_hasher(aff, dic, build_hasher)
    }
}
impl<S: BuildHasher + Clone> Dictionary<S> {
    /// Builds a dictionary from the contents of an affix (`aff`) file and a word-list (`dic`)
    /// file, using `build_hasher` for the word table.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseDictionaryError`] when `aff::parser::parse` rejects the input.
    pub fn new_with_hasher(
        aff: &str,
        dic: &str,
        build_hasher: S,
    ) -> Result<Self, ParseDictionaryError> {
        // The parser hands back the word table first and the affix data second.
        let parsed = aff::parser::parse(aff, dic, build_hasher)?;
        Ok(Self {
            words: parsed.0,
            aff_data: parsed.1,
        })
    }
}
impl<S: BuildHasher> Dictionary<S> {
    /// Returns the verdict of a freshly created [`Checker`] for `word`.
    pub fn check(&self, word: &str) -> bool {
        Checker::new(self).check(word)
    }

    /// Creates a [`Checker`] that borrows this dictionary.
    pub fn checker(&self) -> Checker<S> {
        Checker::new(self)
    }

    /// Runs a freshly created [`Suggester`] for `word`, passing `out` through as the
    /// destination for its suggestions.
    pub fn suggest(&self, word: &str, out: &mut Vec<String>) {
        self.suggester().suggest(word, out);
    }

    /// Creates a [`Suggester`] by converting a new [`Checker`] for this dictionary.
    pub fn suggester(&self) -> Suggester<S> {
        Checker::new(self).into_suggester()
    }

    /// Parses `input` as a single word-list line and inserts the result into the word table.
    ///
    /// The line is interpreted with this dictionary's flag type, flag aliases and ignored
    /// characters.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseFlagError`] when `aff::parser::parse_dic_line` rejects the line; the
    /// word table is left untouched in that case.
    pub fn add(&mut self, input: &str) -> Result<(), ParseFlagError> {
        let aff = &self.aff_data;
        let (stem, flags) = aff::parser::parse_dic_line(
            input,
            aff.flag_type,
            &aff.flag_aliases,
            &aff.ignore_chars,
        )?;
        self.words.insert(stem, flags);
        Ok(())
    }

    /// Marks every word-table entry stored under `word` with the forbidden-word flag.
    ///
    /// Entries are not deleted from the table; each one that lacks the flag is replaced by a
    /// copy of its flag set with the flag added.
    ///
    /// Returns `true` when at least one entry was changed, and `false` when `word` has no
    /// entries or all of them already carried the flag.
    pub fn remove_stem(&mut self, word: &str) -> bool {
        let forbidden = self.aff_data.options.forbidden_word_flag;
        let mut changed = false;
        for entry_flags in self.words.get_mut(word) {
            if entry_flags.contains(&forbidden) {
                // Already forbidden: nothing to do for this entry.
                continue;
            }
            *entry_flags = entry_flags.with_flag(forbidden);
            changed = true;
        }
        changed
    }
}
/// Compact debug output for a dictionary with any hasher builder.
///
/// Only the number of entries in the word table is printed; the remaining state is elided,
/// which the trailing `..` in the output signals. The impl is generic over `S` so that
/// dictionaries created through `new_with_hasher` with a custom hasher are debuggable too,
/// not just `Dictionary<DefaultHashBuilder>`.
impl<S> fmt::Debug for Dictionary<S> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Dictionary")
            .field("words", &self.words.len())
            .finish_non_exhaustive()
    }
}
// A single flag, stored as a non-zero 16-bit integer.
type Flag = core::num::NonZeroU16;
// A set of flags backed by `umbra_slice::FlagSlice`.
//
// The `From<Vec<Flag>>` conversion sorts and deduplicates its input, and `contains` /
// `has_intersection` rely on that ordering. The derived `Default` is what the tests use for
// the empty set.
#[derive(Default, PartialEq, Eq, Clone)]
struct FlagSet(umbra_slice::FlagSlice);
impl From<Vec<Flag>> for FlagSet {
    /// Builds a flag set from an arbitrary list of flags.
    ///
    /// The input may be unordered and may contain repeats; it is normalized to ascending
    /// order without duplicates before being stored.
    ///
    /// # Panics
    ///
    /// Panics if more than `u16::MAX` distinct flags remain after deduplication, or if the
    /// conversion into the backing slice type fails.
    fn from(mut flags: Vec<Flag>) -> Self {
        // Normalize first: lookups and intersections assume a sorted, duplicate-free slice.
        flags.sort_unstable();
        flags.dedup();

        assert!(flags.len() <= u16::MAX as usize);

        let packed = umbra_slice::UmbraSlice::try_from(&flags[..]).unwrap();
        Self(packed)
    }
}
impl FlagSet {
    /// Borrows the flags as a slice.
    #[inline]
    pub fn as_slice(&self) -> &[Flag] {
        self.0.as_slice()
    }

    /// Iterates over the flags in slice order.
    #[inline]
    pub fn iter(&self) -> slice::Iter<'_, Flag> {
        self.as_slice().iter()
    }

    /// Reports whether the set holds no flags.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Reports whether `self` and `other` share at least one flag.
    ///
    /// Both slices are walked in lockstep, always advancing the side whose head is smaller,
    /// so the answer is only meaningful when both sets are sorted (as `From<Vec<Flag>>`
    /// produces them).
    pub fn has_intersection(&self, other: &Self) -> bool {
        let mut lhs = self.as_slice();
        let mut rhs = other.as_slice();

        // Stop as soon as either side runs out: nothing further can match.
        while let (Some((a, lhs_tail)), Some((b, rhs_tail))) =
            (lhs.split_first(), rhs.split_first())
        {
            match a.cmp(b) {
                Ordering::Equal => return true,
                Ordering::Less => lhs = lhs_tail,
                Ordering::Greater => rhs = rhs_tail,
            }
        }

        false
    }

    /// Reports whether `flag` is a member of the set, using the backing slice's sorted lookup.
    #[inline]
    pub fn contains(&self, flag: &Flag) -> bool {
        self.0.sorted_contains(flag)
    }

    /// Returns a new set holding every flag of `self` plus `flag`.
    ///
    /// The result goes through the `From<Vec<Flag>>` conversion, so it is re-sorted and a
    /// flag that was already present is not duplicated.
    pub fn with_flag(&self, flag: Flag) -> Self {
        let mut flags = self.as_slice().to_vec();
        flags.push(flag);
        Self::from(flags)
    }
}
/// Debug output of the form `flagset![1, 2, 3]`: the literal prefix `flagset!` followed by the
/// debug rendering of the flag slice.
impl fmt::Debug for FlagSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let flags = self.0.as_slice();
        write!(f, "flagset!{:?}", flags)
    }
}
// Small integer tag with one named constant per mode below.
// NOTE(review): the modes are not used in this file; presumably the checker uses them to say
// whether a word is being analyzed on its own or as a part of a compound — confirm there.
type AffixingMode = u8;
// Presumably: the word is considered as a whole, outside of any compound.
const FULL_WORD: AffixingMode = 0;
// Presumably: the word is the first part of a compound.
const AT_COMPOUND_BEGIN: AffixingMode = 1;
// Presumably: the word is an inner part of a compound.
const AT_COMPOUND_MIDDLE: AffixingMode = 2;
// Presumably: the word is the last part of a compound.
const AT_COMPOUND_END: AffixingMode = 3;
/// Upper bound on word length used by the crate.
///
/// NOTE(review): the unit (bytes vs. characters) and the place where the limit is enforced are
/// not visible in this file — confirm against the checker before relying on either.
pub const MAX_WORD_LEN: usize = 360;
/// Capitalization pattern of a word, as computed by `classify_casing`.
#[derive(Debug, Clone, Copy)]
enum Casing {
    /// No uppercase characters at all (this includes the empty string).
    None,
    /// Exactly one uppercase character, and it is the first character.
    Init,
    /// At least one uppercase character and no lowercase characters.
    All,
    /// Mixed case whose first character is not uppercase.
    Camel,
    /// Mixed case whose first character is uppercase.
    Pascal,
}

/// Classifies the capitalization pattern of `word`.
///
/// Characters that are neither uppercase nor lowercase (digits, punctuation, ...) are ignored
/// when counting, but they still count as "the first character" when deciding between
/// `Camel` and `Pascal`.
fn classify_casing(word: &str) -> Casing {
    // Tally uppercase and lowercase characters in one pass.
    let (uppercase, lowercase) = word.chars().fold((0, 0), |(up, low), ch| {
        (
            if ch.is_uppercase() { up + 1 } else { up },
            if ch.is_lowercase() { low + 1 } else { low },
        )
    });
    let starts_uppercase = matches!(word.chars().next(), Some(ch) if ch.is_uppercase());

    // Arms are ordered from most to least specific; the first match wins.
    match (uppercase, lowercase, starts_uppercase) {
        (0, _, _) => Casing::None,
        (1, _, true) => Casing::Init,
        (_, 0, _) => Casing::All,
        (_, _, true) => Casing::Pascal,
        (_, _, false) => Casing::Camel,
    }
}
/// Returns `word` with every character listed in `ignore` removed.
///
/// The input is borrowed unchanged whenever there is nothing to remove — either because
/// `ignore` is empty or because `word` contains none of the ignored characters — so a new
/// `String` is only allocated when at least one character is actually dropped.
fn erase_chars<'a>(word: &'a str, ignore: &[char]) -> Cow<'a, str> {
    if ignore.is_empty() {
        return Cow::Borrowed(word);
    }

    // Find the first character that has to go; everything before it can be copied verbatim.
    match word.char_indices().find(|(_, ch)| ignore.contains(ch)) {
        None => Cow::Borrowed(word),
        Some((start, dropped)) => {
            let rest = &word[start + dropped.len_utf8()..];
            let mut kept = String::with_capacity(word.len() - dropped.len_utf8());
            kept.push_str(&word[..start]);
            kept.extend(rest.chars().filter(|ch| !ignore.contains(ch)));
            Cow::Owned(kept)
        }
    }
}
// Test-only fixture: contents of the vendored en_US affix file, embedded at compile time.
#[cfg(test)]
const EN_US_AFF: &str = include_str!("../vendor/en_US/en_US.aff");
// Test-only fixture: contents of the vendored en_US word list, embedded at compile time.
#[cfg(test)]
const EN_US_DIC: &str = include_str!("../vendor/en_US/en_US.dic");
// Test-only en_US dictionary, parsed lazily on first access. Panics on first access if the
// vendored files fail to parse.
#[cfg(test)]
static EN_US: once_cell::sync::Lazy<Dictionary> =
once_cell::sync::Lazy::new(|| Dictionary::new(EN_US_AFF, EN_US_DIC).unwrap());
// Unit tests for the helpers and the public `Dictionary` API defined in this file.
#[cfg(test)]
mod test {
use super::*;
// Builds a single `Flag` from an integer literal; panics on zero because `Flag` is non-zero.
macro_rules! flag {
( $x:expr ) => {{
Flag::new($x as u16).unwrap()
}};
}
// Builds a `FlagSet` from a comma-separated list of integer literals; with no arguments it
// yields the default (empty) set.
macro_rules! flagset {
() => {{
FlagSet::default()
}};
( $( $x:expr ),* ) => {
{
FlagSet::from( $crate::alloc::vec![ $( Flag::new( $x as u16 ).unwrap() ),* ] )
}
};
}
// The `Debug` output mirrors the `flagset!` macro invocation that would build the set.
#[test]
fn flagset_display() {
assert_eq!("flagset![1]", &alloc::format!("{:?}", flagset![1]));
}
// Conversion from a `Vec` sorts the flags and drops duplicates.
#[test]
fn flagset_from_iter() {
assert_eq!(
&[flag!(1), flag!(2), flag!(3)],
flagset![1, 3, 2, 1].as_slice()
)
}
// Intersection is symmetric, handles disjoint and interleaved sets, and two empty sets do not
// intersect.
#[test]
fn flagset_has_intersection() {
assert!(flagset![1, 2, 3].has_intersection(&flagset![1]));
assert!(flagset![1, 2, 3].has_intersection(&flagset![2]));
assert!(flagset![1, 2, 3].has_intersection(&flagset![3]));
assert!(flagset![2].has_intersection(&flagset![1, 2, 3]));
assert!(!flagset![1, 2, 3].has_intersection(&flagset![4, 5, 6]));
assert!(!flagset![4, 5, 6].has_intersection(&flagset![1, 2, 3]));
assert!(!flagset![1, 3, 5].has_intersection(&flagset![2, 4, 6]));
assert!(!flagset![].has_intersection(&flagset![]));
}
// Membership checks for present and absent flags.
#[test]
fn flagset_contains() {
assert!(flagset![1, 2, 3].contains(&flag!(1)));
assert!(flagset![1, 2, 3].contains(&flag!(2)));
assert!(flagset![1, 2, 3].contains(&flag!(3)));
assert!(!flagset![1, 2, 3].contains(&flag!(4)));
}
// One example per `Casing` variant, using non-ASCII (Cyrillic) input, plus the empty string.
#[test]
fn classify_casing_nuspell_unit_test() {
assert!(matches!(classify_casing(""), Casing::None));
assert!(matches!(classify_casing("здраво"), Casing::None));
assert!(matches!(classify_casing("Здраво"), Casing::Init));
assert!(matches!(classify_casing("ЗДРАВО"), Casing::All));
assert!(matches!(classify_casing("здРаВо"), Casing::Camel));
assert!(matches!(classify_casing("ЗдрАво"), Casing::Pascal));
}
// Every listed character is removed from the word.
#[test]
fn erase_chars_test() {
// Local wrapper that turns the `Cow` result into an owned `String` for comparison.
fn erase_chars(word: &str, ignore: &[char]) -> String {
super::erase_chars(word, ignore).into_owned()
}
assert_eq!(
erase_chars("example", &['a', 'e', 'i', 'o', 'u']),
String::from("xmpl")
);
}
// Construction must fail for this input. Presumably the cause is that the affix file declares
// `FLAG num` while the word list uses the non-numeric flag text `world` — confirm in the parser.
#[test]
fn new_on_bad_dictionary() {
let aff = r#"
FLAG num
"#;
let dic = r#"1
hello/world
"#;
assert!(Dictionary::new(aff, dic).is_err());
}
// A word added with a flag is accepted both as-is and in an affixed form (`foobarbazing`).
#[test]
fn add_word() {
let mut dict = Dictionary::new(EN_US_AFF, EN_US_DIC).unwrap();
assert!(!dict.check("foobarbaz"));
dict.add("foobarbaz/G").unwrap();
assert!(dict.check("foobarbaz"));
assert!(dict.check("foobarbazing"));
}
// A clone is independent: words added to the original afterwards do not appear in the copy.
#[test]
fn clone() {
let aff = r#"
"#;
let dic = r#"2
hello
world
"#;
let mut dict = Dictionary::new(aff, dic).unwrap();
let copy = dict.clone();
dict.add("foo").unwrap();
assert!(dict.check("foo"));
assert!(!copy.check("foo"));
}
// The `Debug` output shows only the number of words and elides the rest.
#[test]
fn debug() {
let aff = r#"
"#;
let dic = r#"2
hello
world
"#;
let dict = Dictionary::new(aff, dic).unwrap();
assert_eq!(&alloc::format!("{dict:?}"), "Dictionary { words: 2, .. }");
}
}