1pub(crate) mod parser;
2
3use crate::{
4 alloc::{
5 borrow::Cow,
6 boxed::Box,
7 string::{String, ToString},
8 vec::Vec,
9 },
10 AffixingMode, Flag, FlagSet, AT_COMPOUND_BEGIN, AT_COMPOUND_END, FULL_WORD,
11};
12
13use core::{marker::PhantomData, num::NonZeroU16, str::Chars};
14
/// A reserved flag whose numeric value is `u16::MAX`.
///
/// NOTE(review): going by the name, this presumably marks dictionary entries that are "hidden
/// homonyms" and must never clash with a flag read from a dictionary — confirm against the
/// dictionary-loading code.
// SAFETY (to be confirmed): `Flag::new_unchecked` presumably requires a non-zero value, which
// `u16::MAX` trivially satisfies — verify against the definition of `Flag`.
pub(crate) const HIDDEN_HOMONYM_FLAG: Flag = unsafe { Flag::new_unchecked(u16::MAX) };
/// NOTE(review): presumably the upper bound on how many suggestions are produced for one word —
/// confirm at the use sites, which are outside this section.
pub(crate) const MAX_SUGGESTIONS: usize = 16;
17
/// Tests whether an *optional* flag is a member of a flag collection.
///
/// `$flag` must evaluate to an `Option`; the macro yields `false` for `None` and
/// `$flags.contains(&flag)` for `Some(flag)`. This lets callers test options that may not be
/// configured at all without unwrapping them first.
macro_rules! has_flag {
    ( $flags:expr, $flag:expr ) => {{
        match $flag {
            Some(flag) => $flags.contains(&flag),
            // An unset option can never be present in a flag set.
            None => false,
        }
    }};
}
26
/// The encoding used for flags in the dictionary and affix files.
///
/// NOTE(review): the variants presumably mirror the values of Hunspell's `FLAG` directive
/// (single character, two characters, decimal number, UTF-8 character) — confirm against the
/// parser, which is outside this section.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) enum FlagType {
    /// The default flag type.
    #[default]
    Short,
    Long,
    Numeric,
    Utf8,
}
51
/// A compiled affix condition: a tiny pattern language matched against the start of a string.
///
/// The syntax understood by [`Condition::matches`] is:
///
/// * `.` accepts any single character,
/// * `[abc]` accepts one character out of the listed ones,
/// * `[^abc]` accepts one character that is *not* listed,
/// * any other character has to appear literally.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct Condition {
    /// The pattern text, interpreted on every call to `matches`.
    pattern: Box<str>,
    /// A length figure for the pattern. The affix code in this file compares it against input
    /// lengths and uses it as the number of trailing characters handed to `matches`.
    /// NOTE(review): presumably the number of input characters the pattern consumes, with a
    /// bracket group counting as one — confirm against the parser.
    chars: usize,
}

impl Condition {
    /// Checks whether `input` starts with text accepted by this condition.
    ///
    /// The match is anchored at the beginning of `input` only: once the pattern is exhausted
    /// the result is `true` regardless of what remains in `input`. An empty pattern therefore
    /// accepts every input.
    pub fn matches(&self, input: &str) -> bool {
        let mut candidates = input.chars();
        let mut tokens = self.pattern.chars().peekable();

        while let Some(token) = tokens.next() {
            // The pattern still wants a character but the input has run dry.
            let Some(actual) = candidates.next() else {
                return false;
            };

            let accepted = match token {
                '.' => true,
                '[' => {
                    let negated = tokens.next_if_eq(&'^').is_some();
                    // Walk the entire class so that the pattern cursor ends up just past the
                    // closing `]` (which `take_while` consumes); a fold is used instead of
                    // `any` because it must not stop early.
                    let listed = tokens
                        .by_ref()
                        .take_while(|&member| member != ']')
                        .fold(false, |seen, member| seen || member == actual);
                    // A plain class needs a hit, a negated class needs a miss.
                    listed != negated
                }
                literal => literal == actual,
            };

            if !accepted {
                return false;
            }
        }

        true
    }
}
116
/// A single affix rule, generic over its kind `K` (see [`Pfx`] and [`Sfx`]).
///
/// Applying the rule to a stem removes `strip` and attaches `add`; the `to_derived` and
/// `to_stem` methods of [`Prefix`] and [`Suffix`] implement both directions.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct Affix<K> {
    /// The flag identifying this affix.
    /// NOTE(review): presumably the flag a dictionary word must carry for the rule to apply —
    /// confirm against the checker.
    pub flag: Flag,
    /// NOTE(review): presumably whether this affix may be combined with an affix of the other
    /// kind on the same word — confirm against the checker.
    pub crossproduct: bool,
    /// Text removed from the stem before `add` is attached, if any.
    pub strip: Option<Box<str>>,
    /// Text attached to the stem. It is also the key under which the affix is indexed (see
    /// [`Affix::appending`]).
    pub add: Box<str>,
    /// Optional condition that a word has to satisfy (see `condition_matches`).
    condition: Option<Condition>,
    /// Flags carried by the affix itself; `AffixKind::is_valid` consults them.
    pub flags: FlagSet,
    /// Zero-sized marker tying the value to its kind `K`.
    phantom_data: PhantomData<K>,
}
142
143impl<K: AffixKind> Affix<K> {
144 pub fn new(
145 flag: Flag,
146 crossproduct: bool,
147 strip: Option<&str>,
148 add: &str,
149 condition: Option<&str>,
150 flags: FlagSet,
151 ) -> Result<Self, parser::ConditionError> {
152 let condition = condition.map(str::parse).transpose()?;
153
154 Ok(Self {
155 flag,
156 crossproduct,
157 strip: strip.map(|str| str.into()),
158 add: add.into(),
159 flags,
160 condition,
161 phantom_data: PhantomData,
162 })
163 }
164
165 pub fn appending(&self) -> K::Chars<'_> {
166 K::chars(&self.add)
167 }
168
169 #[inline]
170 pub fn is_modifying(&self) -> bool {
171 self.strip.is_some() || !self.add.is_empty()
172 }
173}
174
/// Marker type selecting prefix behaviour for [`Affix`] and [`AffixIndex`].
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) struct Pfx;
/// Marker type selecting suffix behaviour for [`Affix`] and [`AffixIndex`].
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) struct Sfx;

/// An affix attached to the beginning of a word.
pub(crate) type Prefix = Affix<Pfx>;
/// An affix attached to the end of a word.
pub(crate) type Suffix = Affix<Sfx>;
184
/// Behaviour that differs between prefixes and suffixes.
///
/// Implemented by the marker types [`Pfx`] and [`Sfx`].
pub(crate) trait AffixKind {
    /// Iterator over the characters of a word in the direction relevant to this kind of affix:
    /// front-to-back for prefixes, back-to-front for suffixes.
    type Chars<'a>: Iterator<Item = char>
    where
        Self: 'a;

    /// Returns the characters of `word` in this kind's direction.
    fn chars(word: &str) -> Self::Chars<'_>;

    /// Decides whether `affix` may be used in the affixing mode `MODE`, given the relevant flags
    /// configured in `options`.
    fn is_valid<const MODE: AffixingMode>(affix: &Affix<Self>, options: &AffOptions) -> bool
    where
        Self: Sized;
}
204
205impl AffixKind for Pfx {
206 type Chars<'a> = Chars<'a>;
207
208 fn chars(word: &str) -> Self::Chars<'_> {
209 word.chars()
210 }
211
212 fn is_valid<const MODE: AffixingMode>(prefix: &Prefix, options: &AffOptions) -> bool {
213 if MODE == FULL_WORD && has_flag!(prefix.flags, options.only_in_compound_flag) {
214 return false;
215 }
216
217 if MODE == AT_COMPOUND_END && !has_flag!(prefix.flags, options.compound_permit_flag) {
218 return false;
219 }
220
221 if MODE != FULL_WORD && has_flag!(prefix.flags, options.compound_forbid_flag) {
222 return false;
223 }
224
225 true
226 }
227}
228
229impl AffixKind for Sfx {
230 type Chars<'a> = core::iter::Rev<Chars<'a>>;
231
232 fn chars(word: &str) -> Self::Chars<'_> {
233 word.chars().rev()
234 }
235
236 fn is_valid<const MODE: AffixingMode>(suffix: &Suffix, options: &AffOptions) -> bool {
237 if MODE == FULL_WORD && has_flag!(suffix.flags, options.only_in_compound_flag) {
238 return false;
239 }
240
241 if MODE == AT_COMPOUND_BEGIN && !has_flag!(suffix.flags, options.compound_permit_flag) {
242 return false;
243 }
244
245 if MODE != FULL_WORD && has_flag!(suffix.flags, options.compound_forbid_flag) {
246 return false;
247 }
248
249 true
250 }
251}
252
253impl Prefix {
254 pub fn to_stem<'a>(&self, word: &'a str) -> Cow<'a, str> {
264 let stripped = word
265 .strip_prefix(&*self.add)
266 .expect("to_stem should only be called when the `add` is a prefix of the word");
267
268 match &self.strip {
269 Some(strip) => {
270 let mut stem = strip.to_string();
271 stem.push_str(stripped);
272 Cow::Owned(stem)
273 }
274 None => Cow::Borrowed(stripped),
275 }
276 }
277
278 pub fn to_derived(&self, word: &str) -> String {
290 let stripped = match &self.strip {
291 Some(strip) => word
292 .strip_prefix(&**strip)
293 .expect("to_derived should only be called when `strip` is a prefix of the word"),
294 None => word,
295 };
296 let mut stem = self.add.to_string();
297 stem.push_str(stripped);
298 stem
299 }
300
301 pub fn condition_matches(&self, word: &str) -> bool {
302 let condition = match self.condition.as_ref() {
303 Some(condition) => condition,
304 None => return true,
305 };
306
307 if word.len() < condition.chars {
309 return false;
310 }
311
312 condition.matches(word)
313 }
314}
315
316impl Suffix {
317 pub fn to_stem<'a>(&self, word: &'a str) -> Cow<'a, str> {
327 let stripped = word
328 .strip_suffix(&*self.add)
329 .expect("to_stem should only be called when the `add` is a suffix of the word");
330
331 match self.strip.as_deref() {
332 Some(strip) => {
333 let mut stem = stripped.to_string();
334 stem.push_str(strip);
335 Cow::Owned(stem)
336 }
337 None => Cow::Borrowed(stripped),
338 }
339 }
340
341 pub fn to_derived(&self, word: &str) -> String {
353 let mut stem = match &self.strip {
354 Some(strip) => word
355 .strip_suffix(&**strip)
356 .expect("to_derived should only be called when `strip` is a prefix of the word"),
357 None => word,
358 }
359 .to_string();
360 stem.push_str(&self.add);
361 stem
362 }
363
364 pub fn condition_matches(&self, word: &str) -> bool {
365 let condition = match self.condition.as_ref() {
366 Some(condition) => condition,
367 None => return true,
368 };
369
370 let len_bytes = word.len();
372 if len_bytes < condition.chars {
373 return false;
374 }
375
376 let (chars, bytes) = word
377 .char_indices()
378 .rev()
379 .take(condition.chars)
380 .fold((0, 0), |(chars, _bytes), (byte_index, _ch)| {
381 (chars + 1, len_bytes - byte_index)
382 });
383
384 if chars < condition.chars {
385 return false;
386 }
387 condition.matches(&word[word.len() - bytes..])
388 }
389}
390
/// Index over all prefixes of a dictionary.
pub(crate) type PrefixIndex = AffixIndex<Pfx>;
/// Index over all suffixes of a dictionary.
pub(crate) type SuffixIndex = AffixIndex<Sfx>;

/// A sorted table of affixes that can be queried for every affix whose `add` text occurs at the
/// relevant end of a word (see [`AffixIndex::affixes_of`]).
#[derive(Debug, Clone)]
pub(crate) struct AffixIndex<C> {
    /// All affixes, sorted by the character sequence returned by `Affix::appending`.
    table: Box<[Affix<C>]>,
    /// The distinct first key characters occurring in `table`, in sorted order.
    first_char: Box<[char]>,
    /// For each entry of `first_char`, the index in `table` where the run of affixes starting
    /// with that character begins. A final extra entry holds `table.len()`, so that entry `i`
    /// and entry `i + 1` always delimit a run.
    prefix_idx_with_first_char: Box<[usize]>,
    /// The union of the `flags` of every affix in the table.
    pub all_flags: FlagSet,
}
446
447impl<C: AffixKind> FromIterator<Affix<C>> for AffixIndex<C> {
448 fn from_iter<T: IntoIterator<Item = Affix<C>>>(iter: T) -> Self {
449 let table: Vec<_> = iter.into_iter().collect();
450 table.into()
451 }
452}
453
impl<C: AffixKind> From<Vec<Affix<C>>> for AffixIndex<C> {
    /// Builds the index: sorts the affixes by key, records where the run for each distinct
    /// first key character starts, and collects the flags of all affixes.
    fn from(mut table: Vec<Affix<C>>) -> Self {
        // Sort by the key characters in the kind's direction (reversed `add` for suffixes).
        // Lexicographic ordering puts empty keys first and a key before its extensions.
        table.sort_unstable_by(|a, b| a.appending().cmp(b.appending()));

        let mut first_char = Vec::new();
        let mut prefix_idx_with_first_char = Vec::new();

        // Skip the affixes with an empty key; they sit at the front of the sorted table.
        let mut first_char_idx = table.partition_point(|affix| affix.appending().next().is_none());
        while first_char_idx < table.len() {
            let ch = table[first_char_idx]
                .appending()
                .next()
                .expect("vec is sorted so empty keys are before the partition point");

            // Record the character together with the index where its run begins.
            first_char.push(ch);
            prefix_idx_with_first_char.push(first_char_idx);

            // Find the start of the next run: the first affix whose first character is greater.
            // `position` is relative to `first_char_idx`, hence the `+=`.
            match table[first_char_idx..].iter().position(|affix| {
                affix
                    .appending()
                    .next()
                    .expect("vec is sorted so empty keys are before the partition point")
                    > ch
            }) {
                Some(next_char_index) => first_char_idx += next_char_index,
                None => break,
            }
        }
        // Sentinel entry so that the last run also has an end index.
        prefix_idx_with_first_char.push(table.len());

        // Union of the flags carried by all affixes.
        let flags = table
            .iter()
            .flat_map(|affix| affix.flags.iter().copied())
            .collect::<Vec<Flag>>()
            .into();

        Self {
            table: table.into(),
            first_char: first_char.into(),
            prefix_idx_with_first_char: prefix_idx_with_first_char.into(),
            all_flags: flags,
        }
    }
}
506
507impl<C: AffixKind> AffixIndex<C> {
508 pub fn affixes_of<'index, 'word>(
509 &'index self,
510 word: &'word str,
511 ) -> AffixesIter<'index, 'word, C> {
512 AffixesIter {
513 table: &self.table,
514 first_char: &self.first_char,
515 prefix_idx_with_first_char: &self.prefix_idx_with_first_char,
516 chars: C::chars(word),
517 chars_matched: 0,
518 }
519 }
520
521 pub fn len(&self) -> usize {
522 self.table.len()
523 }
524
525 pub fn iter(&self) -> core::slice::Iter<Affix<C>> {
526 self.table.iter()
527 }
528}
529
/// Iterator returned by [`AffixIndex::affixes_of`].
///
/// It yields affixes in order of increasing key length by progressively narrowing `table` as
/// more characters of the word are consumed.
pub(crate) struct AffixesIter<'index, 'word, C: AffixKind + 'static> {
    /// The affixes that may still match; shrinks as the iteration proceeds.
    table: &'index [Affix<C>],
    /// The index's distinct first key characters.
    first_char: &'index [char],
    /// The index's run boundaries, parallel to `first_char`.
    prefix_idx_with_first_char: &'index [usize],
    /// The not yet consumed characters of the word, in the kind's direction.
    chars: C::Chars<'word>,
    /// The number of word characters that every affix left in `table` has already matched.
    chars_matched: usize,
}
538
impl<'index, C: AffixKind> Iterator for AffixesIter<'index, '_, C> {
    type Item = &'index Affix<C>;

    /// Yields the next affix whose key matches the word.
    ///
    /// Affixes with an empty key come first, followed by affixes with longer and longer keys.
    fn next(&mut self) -> Option<Self::Item> {
        // Phase one: no character of the word has been consumed yet.
        if self.chars_matched == 0 {
            if self.table.is_empty() {
                return None;
            }

            let item = &self.table[0];
            if item.appending().next().is_some() {
                // All empty-key affixes have been yielded. Use the first-character lookup to
                // jump straight to the run of affixes starting with the word's first character.
                let ch = self.chars.next()?;
                let first_char_idx = self.first_char.iter().position(|c| *c == ch)?;

                // The stored indices refer to the full table, while `self.table` has already
                // been advanced past the empty-key affixes; the first stored index equals the
                // number of those affixes.
                let empty_offset = self.prefix_idx_with_first_char[0];
                let start = self.prefix_idx_with_first_char[first_char_idx] - empty_offset;
                let end = self.prefix_idx_with_first_char[first_char_idx + 1] - empty_offset;
                self.table = &self.table[start..end];
                self.chars_matched = 1;
            } else {
                // An empty key matches every word.
                self.table = &self.table[1..];
                return Some(item);
            }
        }

        // Phase two: every affix left in `table` agrees with the word on the first
        // `chars_matched` characters.
        loop {
            if self.table.is_empty() {
                return None;
            }

            let item = &self.table[0];
            // The table is sorted, so the shortest keys come first. A key with exactly
            // `chars_matched` characters has been matched completely.
            if item.appending().count() == self.chars_matched {
                self.table = &self.table[1..];
                return Some(item);
            }

            // All remaining keys are longer: consume another character of the word.
            let ch = self.chars.next()?;

            // Drop the affixes in front of the first one whose next character equals `ch`...
            let char_beginning_idx = self
                .table
                .iter()
                .position(|affix| affix.appending().nth(self.chars_matched) == Some(ch))?;
            self.table = &self.table[char_beginning_idx..];

            // ...and those behind the last one. The matching affixes are contiguous because the
            // table is sorted.
            let char_end_idx = self
                .table
                .partition_point(|affix| affix.appending().nth(self.chars_matched) == Some(ch));
            self.table = &self.table[..char_end_idx];

            self.chars_matched += 1;
        }
    }
}
614
/// Break patterns, grouped by where in a word they apply.
///
/// All patterns live in one slice: patterns anchored to the start of a word first, then the
/// unanchored ones, then the patterns anchored to the end. The two indices mark the group
/// boundaries.
#[derive(Debug, Clone)]
pub(crate) struct BreakTable {
    /// The patterns with their `^` / `$` anchors removed.
    table: Box<[Box<str>]>,
    /// Index one past the last start-of-word pattern.
    start_word_breaks_last_idx: usize,
    /// Index one past the last unanchored pattern.
    middle_word_breaks_last_idx: usize,
}

impl BreakTable {
    /// Sorts `breaks` into the three groups.
    ///
    /// A leading `^` marks a start-of-word pattern and takes precedence; otherwise a trailing
    /// `$` marks an end-of-word pattern. The input order is kept within each group.
    ///
    /// # Panics
    ///
    /// Panics if any pattern is empty.
    fn new(breaks: &[&str]) -> Self {
        let mut starts: Vec<Box<str>> = Vec::new();
        let mut middles: Vec<Box<str>> = Vec::new();
        let mut ends: Vec<Box<str>> = Vec::new();

        for &b in breaks {
            assert!(!b.is_empty());

            match (b.strip_prefix('^'), b.strip_suffix('$')) {
                (Some(rest), _) => starts.push(rest.into()),
                (None, Some(rest)) => ends.push(rest.into()),
                (None, None) => middles.push(b.into()),
            }
        }

        let start_word_breaks_last_idx = starts.len();
        let middle_word_breaks_last_idx = start_word_breaks_last_idx + middles.len();
        let table: Box<[Box<str>]> = starts.into_iter().chain(middles).chain(ends).collect();

        Self {
            table,
            start_word_breaks_last_idx,
            middle_word_breaks_last_idx,
        }
    }

    /// The patterns that were anchored to the start of a word (`^` removed).
    #[inline]
    pub fn start_word_breaks(&self) -> impl Iterator<Item = &str> {
        let group = &self.table[..self.start_word_breaks_last_idx];
        group.iter().map(|pattern| &**pattern)
    }

    /// The patterns without an anchor.
    #[inline]
    pub fn middle_word_breaks(&self) -> impl Iterator<Item = &str> {
        let group = &self.table[self.start_word_breaks_last_idx..self.middle_word_breaks_last_idx];
        group.iter().map(|pattern| &**pattern)
    }

    /// The patterns that were anchored to the end of a word (`$` removed).
    #[inline]
    pub fn end_word_breaks(&self) -> impl Iterator<Item = &str> {
        let group = &self.table[self.middle_word_breaks_last_idx..];
        group.iter().map(|pattern| &**pattern)
    }
}
684
/// Replacement pairs `(from, to)`, grouped by where in a word `from` has to occur.
///
/// All pairs live in one slice in the order: whole word, start of word, end of word, anywhere.
/// The three indices mark the group boundaries.
#[derive(Debug, Clone)]
pub(crate) struct ReplacementTable {
    /// The pairs, with the `^` / `$` anchors removed from `from`.
    table: Box<[(Box<str>, Box<str>)]>,
    /// Index one past the last whole-word pair.
    whole_word_replacements_last_idx: usize,
    /// Index one past the last start-of-word pair.
    start_word_replacements_last_idx: usize,
    /// Index one past the last end-of-word pair.
    end_word_replacements_last_idx: usize,
}

impl From<Vec<(&str, String)>> for ReplacementTable {
    /// Sorts the pairs into the four groups according to the anchors on `from`: `^...$` is a
    /// whole-word pair, `^...` a start-of-word pair, `...$` an end-of-word pair and everything
    /// else may occur anywhere. The input order is kept within each group.
    ///
    /// # Panics
    ///
    /// Panics if either side of a pair is empty.
    fn from(replacements: Vec<(&str, String)>) -> Self {
        type Pairs = Vec<(Box<str>, Box<str>)>;

        let mut whole = Pairs::new();
        let mut start = Pairs::new();
        let mut end = Pairs::new();
        let mut anywhere = Pairs::new();

        for (from, to) in replacements {
            assert!(!from.is_empty() && !to.is_empty());

            // Pick the group and the pattern with its anchors taken off.
            let (group, pattern) = match from.strip_prefix('^') {
                Some(anchored) => match anchored.strip_suffix('$') {
                    Some(exact) => (&mut whole, exact),
                    None => (&mut start, anchored),
                },
                None => match from.strip_suffix('$') {
                    Some(tail) => (&mut end, tail),
                    None => (&mut anywhere, from),
                },
            };
            group.push((Box::from(pattern), to.into_boxed_str()));
        }

        let whole_word_replacements_last_idx = whole.len();
        let start_word_replacements_last_idx = whole_word_replacements_last_idx + start.len();
        let end_word_replacements_last_idx = start_word_replacements_last_idx + end.len();
        let table: Box<[(Box<str>, Box<str>)]> = whole
            .into_iter()
            .chain(start)
            .chain(end)
            .chain(anywhere)
            .collect();

        Self {
            table,
            whole_word_replacements_last_idx,
            start_word_replacements_last_idx,
            end_word_replacements_last_idx,
        }
    }
}

impl ReplacementTable {
    /// The pairs whose `from` was anchored on both sides (`^...$`).
    #[inline]
    pub fn whole_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
        let group = &self.table[..self.whole_word_replacements_last_idx];
        group.iter().map(|(from, to)| (&**from, &**to))
    }

    /// The pairs whose `from` was anchored to the start of a word only (`^...`).
    #[inline]
    pub fn start_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
        let group = &self.table
            [self.whole_word_replacements_last_idx..self.start_word_replacements_last_idx];
        group.iter().map(|(from, to)| (&**from, &**to))
    }

    /// The pairs whose `from` was anchored to the end of a word only (`...$`).
    #[inline]
    pub fn end_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
        let group =
            &self.table[self.start_word_replacements_last_idx..self.end_word_replacements_last_idx];
        group.iter().map(|(from, to)| (&**from, &**to))
    }

    /// The pairs whose `from` carried no anchor.
    #[inline]
    pub fn any_place_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
        let group = &self.table[self.end_word_replacements_last_idx..];
        group.iter().map(|(from, to)| (&**from, &**to))
    }

    /// `true` when every pair in the table is a whole-word pair; an empty table qualifies.
    #[inline]
    pub fn has_only_whole_word_replacements(&self) -> bool {
        self.table.len() == self.whole_word_replacements_last_idx
    }
}
786
/// One element of a compound rule: a flag together with an optional repetition modifier.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct CompoundRuleElement {
    /// The flag that a word's flag set has to contain for the word to match this element.
    pub flag: Flag,
    /// How often the element may match; `None` means exactly once.
    pub modifier: Option<CompoundRuleModifier>,
}
814
/// Repetition modifier of a [`CompoundRuleElement`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum CompoundRuleModifier {
    /// The element may be skipped or may match a single word.
    ZeroOrOne,
    /// The element may be skipped or may match any number of consecutive words.
    ZeroOrMore,
}
820
/// A compound rule: a sequence of elements that the flag sets of consecutive words are matched
/// against.
type CompoundRule = Box<[CompoundRuleElement]>;
822
823fn compound_rule_matches(pattern: &[CompoundRuleElement], data: &[&FlagSet]) -> bool {
824 use crate::alloc::vec;
825 use CompoundRuleModifier::*;
826
827 let mut stack = vec![(0, 0)];
829
830 while let Some((pattern_idx, data_idx)) = stack.pop() {
831 if pattern_idx == pattern.len() {
832 return data_idx == data.len();
833 }
834
835 let flag_matches = match data.get(data_idx) {
842 Some(flagset) => flagset.contains(&pattern[pattern_idx].flag),
843 None => false,
844 };
845 match pattern[pattern_idx].modifier {
846 Some(ZeroOrOne) => {
847 stack.push((pattern_idx + 1, data_idx));
849 if flag_matches {
850 stack.push((pattern_idx + 1, data_idx + 1));
852 }
853 }
854 Some(ZeroOrMore) => {
855 stack.push((pattern_idx + 1, data_idx));
857 if flag_matches {
858 stack.push((pattern_idx, data_idx + 1));
861 }
862 }
863 None => {
864 if flag_matches {
866 stack.push((pattern_idx + 1, data_idx + 1));
867 }
868 }
869 }
870 }
871
872 false
873}
874
/// The set of compound rules, together with the flags they mention.
#[derive(Debug, Clone)]
pub(crate) struct CompoundRuleTable {
    /// The rules themselves.
    rules: Box<[CompoundRule]>,
    /// Every flag that occurs in any rule; used to cheaply test whether a word can take part in
    /// a rule at all (see `has_any_flags`).
    all_flags: FlagSet,
}
883
884impl From<Vec<CompoundRule>> for CompoundRuleTable {
885 fn from(rules: Vec<CompoundRule>) -> Self {
886 let all_flags: Vec<_> = rules
887 .iter()
888 .flat_map(|rule| rule.iter().map(|el| el.flag))
889 .collect();
890
891 Self {
892 rules: rules.into_boxed_slice(),
893 all_flags: all_flags.into(),
894 }
895 }
896}
897
898impl CompoundRuleTable {
899 #[inline]
900 pub fn is_empty(&self) -> bool {
901 self.rules.is_empty()
902 }
903
904 #[inline]
907 pub fn has_any_flags(&self, flagset: &FlagSet) -> bool {
908 self.all_flags.has_intersection(flagset)
909 }
910
911 pub fn any_rule_matches(&self, flagsets: &[&FlagSet]) -> bool {
923 self.rules
924 .iter()
925 .any(|rule| compound_rule_matches(rule, flagsets))
926 }
927}
928
/// Two strings stored back to back in a single allocation.
#[derive(Debug, Clone)]
pub(crate) struct StrPair {
    /// The left string immediately followed by the right string.
    inner: Box<str>,
    /// The byte index in `inner` at which the right string starts.
    partition: usize,
}

impl StrPair {
    /// Joins `left` and `right`, remembering where one ends and the other begins.
    pub fn new(left: &str, right: &str) -> Self {
        let partition = left.len();

        let mut joined = String::with_capacity(partition + right.len());
        joined.push_str(left);
        joined.push_str(right);

        Self {
            inner: joined.into_boxed_str(),
            partition,
        }
    }

    /// The concatenation of both strings.
    #[inline]
    pub fn full_str(&self) -> &str {
        &*self.inner
    }

    /// The byte index at which the right string starts within [`StrPair::full_str`].
    #[inline]
    pub fn partition(&self) -> usize {
        self.partition
    }
}
960
/// A pattern describing a boundary between two words of a compound.
///
/// NOTE(review): presumably the parsed form of one Hunspell `CHECKCOMPOUNDPATTERN` entry —
/// confirm against the parser and the compounding code, which are outside this section.
#[derive(Debug, Clone)]
pub(crate) struct CompoundPattern {
    /// Two strings stored together.
    /// NOTE(review): going by the name, the characters ending the first word and those beginning
    /// the second word — confirm against the use site.
    pub begin_end_chars: StrPair,
    /// Currently unused (hence the `dead_code` allowance).
    #[allow(dead_code)]
    pub replacement: Option<Box<str>>,
    /// Optional flag associated with the first word.
    pub first_word_flag: Option<Flag>,
    /// Optional flag associated with the second word.
    pub second_word_flag: Option<Flag>,
    pub match_first_only_unaffixed_or_zero_affixed: bool,
}
971
/// A single text conversion: occurrences of `from` are to be replaced by `to`.
#[derive(Debug, Clone)]
struct Conversion {
    /// The text to look for.
    from: Box<str>,
    /// The text to put in its place.
    to: Box<str>,
    /// When set, `from` only matches at the very end of the searched text.
    anchor_end: bool,
}

impl Conversion {
    /// Finds where `from` occurs in `word`.
    ///
    /// Returns the byte index of the match: the first occurrence for an unanchored conversion,
    /// or the position of the trailing occurrence for an end-anchored one. Returns `None` when
    /// there is no match.
    ///
    /// An empty `from` never matches. It would otherwise match at every position without
    /// consuming any input, and a caller that keeps converting until nothing matches (as
    /// `ConversionTable::convert` does) would never terminate.
    fn find(&self, word: &str) -> Option<usize> {
        if self.from.is_empty() || word.len() < self.from.len() {
            return None;
        }

        if self.anchor_end {
            word.ends_with(&*self.from)
                .then_some(word.len() - self.from.len())
        } else {
            word.find(&*self.from)
        }
    }
}
995
/// A table of text conversions applied to a word by [`ConversionTable::convert`].
#[derive(Debug, Clone)]
pub(crate) struct ConversionTable {
    /// The conversions, in the order produced by the `From<Vec<(&str, &str)>>` impl.
    inner: Box<[Conversion]>,
}
1006
1007impl From<Vec<(&str, &str)>> for ConversionTable {
1008 fn from(mut table: Vec<(&str, &str)>) -> Self {
1009 table.sort_unstable_by_key(|(from, _to)| core::cmp::Reverse(*from));
1011
1012 Self {
1013 inner: table
1014 .into_iter()
1015 .map(|(from, to)| {
1016 let (from, anchor_end) = if let Some(from) = from.strip_suffix('_') {
1017 (from, true)
1018 } else {
1019 (from, false)
1020 };
1021 Conversion {
1022 to: to.into(),
1023 from: from.into(),
1024 anchor_end,
1025 }
1026 })
1027 .collect(),
1028 }
1029 }
1030}
1031
/// The best conversion found by `ConversionTable::find_match` for one round of `convert`.
#[derive(Debug)]
struct ConversionMatch<'a> {
    /// Byte index in the word at which the match starts.
    start: usize,
    /// The matched text.
    from: &'a str,
    /// The text to put in place of the matched text.
    replacement: &'a str,
}
1043
1044impl ConversionTable {
1045 fn find_match<'a>(&'a self, word: &str, offset: usize) -> Option<ConversionMatch<'a>> {
1048 let mut mat: Option<ConversionMatch> = None;
1049
1050 for conversion in self.inner.iter() {
1051 let start = match conversion.find(&word[offset..]) {
1052 Some(idx) => idx + offset,
1053 None => continue,
1054 };
1055
1056 if mat.is_none()
1058 || mat.as_ref().is_some_and(|mat| {
1059 start < mat.start
1060 || (start == mat.start && conversion.from.len() > mat.from.len())
1061 })
1062 {
1063 mat = Some(ConversionMatch {
1064 start,
1065 from: &conversion.from,
1066 replacement: &conversion.to,
1067 })
1068 }
1069 }
1070
1071 mat
1072 }
1073
1074 pub fn convert<'a>(&self, word: &'a str) -> Cow<'a, str> {
1078 let mut word = Cow::Borrowed(word);
1079
1080 let mut i = 0;
1081 while let Some(conversion) = self.find_match(&word, i) {
1082 let mut string = word.into_owned();
1083 let range = conversion.start..conversion.start + conversion.from.len();
1085 string.replace_range(range, conversion.replacement);
1086 word = Cow::Owned(string);
1087 i = conversion.start + conversion.replacement.len();
1088 }
1089
1090 word
1091 }
1092}
1093
/// Selects the case-conversion rules used for words.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) enum CaseHandling {
    /// Dotted and dotless `i` are kept apart: `I` pairs with `ı` and `İ` pairs with `i`.
    Turkic,
    /// The plain Unicode conversions of the standard library.
    #[default]
    Standard,
}

impl CaseHandling {
    /// Converts the whole word to lowercase.
    pub fn lowercase(&self, word: &str) -> String {
        match self {
            Self::Standard => word.to_lowercase(),
            Self::Turkic => {
                // Resolve the two special letters first, then let the standard conversion deal
                // with everything else.
                let dotted: String = word
                    .chars()
                    .map(|ch| match ch {
                        'I' => 'ı',
                        'İ' => 'i',
                        other => other,
                    })
                    .collect();
                dotted.to_lowercase()
            }
        }
    }

    /// Converts the whole word to uppercase.
    pub fn uppercase(&self, word: &str) -> String {
        match self {
            Self::Standard => word.to_uppercase(),
            Self::Turkic => {
                let dotted: String = word
                    .chars()
                    .map(|ch| match ch {
                        'i' => 'İ',
                        'ı' => 'I',
                        other => other,
                    })
                    .collect();
                dotted.to_uppercase()
            }
        }
    }

    /// Uppercases the first character and lowercases all the others.
    ///
    /// # Panics
    ///
    /// Panics if `word` is empty.
    pub fn titlecase(&self, word: &str) -> String {
        let turkic = matches!(self, Self::Turkic);
        let mut rest = word.chars();
        let initial = rest.next().expect("non-empty input");

        let mut titled = String::with_capacity(word.len());
        match initial {
            'i' if turkic => titled.push('İ'),
            'ı' if turkic => titled.push('I'),
            _ => titled.extend(initial.to_uppercase()),
        }
        for ch in rest {
            match ch {
                'I' if turkic => titled.push('ı'),
                'İ' if turkic => titled.push('i'),
                _ => titled.extend(ch.to_lowercase()),
            }
        }
        titled
    }

    /// Lowercases the first character and leaves the rest of the word untouched.
    ///
    /// # Panics
    ///
    /// Panics if `word` is empty.
    pub fn lower_first_char(&self, word: &str) -> String {
        let turkic = matches!(self, Self::Turkic);
        let initial = word.chars().next().expect("non-empty input");

        let mut lowered = String::with_capacity(word.len());
        match initial {
            'I' if turkic => lowered.push('ı'),
            'İ' if turkic => lowered.push('i'),
            _ => lowered.extend(initial.to_lowercase()),
        }
        // Everything behind the first character is copied verbatim.
        lowered.push_str(&word[initial.len_utf8()..]);
        lowered
    }

    /// Uppercases the character starting at byte index `idx` and leaves the rest untouched.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is not a character boundary inside `word` or if no character starts
    /// there.
    pub fn upper_char_at(&self, word: &str, idx: usize) -> String {
        let turkic = matches!(self, Self::Turkic);

        let mut raised = String::with_capacity(word.len());
        raised.push_str(&word[..idx]);

        let target = word[idx..]
            .chars()
            .next()
            .expect("a char at the given index");
        match target {
            'ı' if turkic => raised.push('I'),
            'i' if turkic => raised.push('İ'),
            _ => raised.extend(target.to_uppercase()),
        }

        raised.push_str(&word[idx + target.len_utf8()..]);
        raised
    }

    /// Tells whether `left` is the lowercase form of `right`.
    ///
    /// Characters whose lowercase form consists of more than one character never compare
    /// equal.
    pub fn is_char_eq_lowercase(&self, left: char, right: char) -> bool {
        if matches!(
            (self, left, right),
            (Self::Turkic, 'ı', 'I') | (Self::Turkic, 'i', 'İ')
        ) {
            return true;
        }

        let mut lowered = right.to_lowercase();
        match (lowered.next(), lowered.next()) {
            (Some(only), None) => left == only,
            _ => false,
        }
    }
}
1191
/// A group of characters and strings that are considered similar to each other.
///
/// NOTE(review): presumably the parsed form of one Hunspell `MAP` entry, used when generating
/// suggestions — confirm against the parser and suggester, which are outside this section.
#[derive(Debug, Clone)]
pub(crate) struct SimilarityGroup {
    /// The single characters of the group, stored as one string.
    pub chars: Box<str>,
    /// The multi-character members of the group.
    pub strings: Box<[Box<str>]>,
}
1197
/// Everything that describes a dictionary's affix configuration: affix indices, the various
/// lookup tables and the scalar options.
///
/// NOTE(review): the fields presumably correspond to the directives of a Hunspell `.aff` file;
/// the per-field notes below are limited to what the types in this file show — confirm details
/// against the parser.
#[derive(Debug, Clone)]
pub(crate) struct AffData {
    /// Index over all prefixes.
    pub prefixes: PrefixIndex,
    /// Index over all suffixes.
    pub suffixes: SuffixIndex,
    /// Break patterns grouped by position in the word.
    pub break_table: BreakTable,
    /// Compound rules expressed over word flags.
    pub compound_rules: CompoundRuleTable,
    pub compound_syllable_vowels: Box<str>,
    pub compound_patterns: Box<[CompoundPattern]>,
    /// Text conversions (presumably applied to input words — confirm).
    pub input_conversions: ConversionTable,
    /// Text conversions (presumably applied to output words — confirm).
    pub output_conversions: ConversionTable,
    /// Replacement pairs grouped by position in the word.
    pub replacements: ReplacementTable,
    pub similarities: Box<[SimilarityGroup]>,
    pub ignore_chars: Box<[char]>,
    pub keyboard_closeness: Box<str>,
    pub try_chars: Box<str>,
    /// Scalar options and special flags.
    pub options: AffOptions,
    /// The flag encoding in use.
    pub flag_type: FlagType,
    pub flag_aliases: Box<[FlagSet]>,
}
1222
/// Scalar options and special flags of an affix configuration.
///
/// Every `Option<Flag>` field is `None` while the corresponding flag is not configured; the
/// `has_flag!` macro treats such a flag as never present. The defaults are defined by the
/// `Default` impl: everything is `false` / `None` / standard case handling, except for
/// `max_compound_suggestions` (3), `max_ngram_suggestions` (5) and `max_diff_factor` (5).
///
/// NOTE(review): the field names appear to mirror Hunspell `.aff` directives — confirm the
/// meaning of individual fields against the parser and checker, which are outside this section.
#[derive(Debug, Clone, Copy)]
pub(crate) struct AffOptions {
    pub complex_prefixes: bool,
    pub fullstrip: bool,
    pub checksharps: bool,
    pub forbid_warn: bool,
    /// Affixes carrying this flag are rejected when a whole word is checked (see
    /// `AffixKind::is_valid`).
    pub only_in_compound_flag: Option<Flag>,
    pub circumfix_flag: Option<Flag>,
    pub forbidden_word_flag: Option<Flag>,
    pub keep_case_flag: Option<Flag>,
    pub need_affix_flag: Option<Flag>,
    pub warn_flag: Option<Flag>,
    /// One of the four flags whose presence enables compounding (see `allows_compounding`).
    pub compound_flag: Option<Flag>,
    /// One of the four flags whose presence enables compounding (see `allows_compounding`).
    pub compound_begin_flag: Option<Flag>,
    /// One of the four flags whose presence enables compounding (see `allows_compounding`).
    pub compound_middle_flag: Option<Flag>,
    /// One of the four flags whose presence enables compounding (see `allows_compounding`).
    pub compound_end_flag: Option<Flag>,
    pub compound_min_length: Option<NonZeroU16>,
    pub compound_max_word_count: Option<NonZeroU16>,
    /// A prefix at the end of a compound, or a suffix at its beginning, is only accepted when it
    /// carries this flag (see `AffixKind::is_valid`).
    pub compound_permit_flag: Option<Flag>,
    /// Affixes carrying this flag are rejected inside compounds (see `AffixKind::is_valid`).
    pub compound_forbid_flag: Option<Flag>,
    pub compound_root_flag: Option<Flag>,
    pub compound_force_uppercase_flag: Option<Flag>,
    pub compound_more_suffixes: bool,
    pub compound_check_duplicate: bool,
    pub compound_check_rep: bool,
    pub compound_check_case: bool,
    pub compound_check_triple: bool,
    pub compound_simplified_triple: bool,
    pub compound_syllable_num: bool,
    pub compound_syllable_max: Option<NonZeroU16>,
    /// Defaults to 3.
    pub max_compound_suggestions: u16,
    pub no_suggest_flag: Option<Flag>,
    pub substandard_flag: Option<Flag>,
    /// Defaults to 5.
    pub max_ngram_suggestions: u16,
    /// Defaults to 5.
    pub max_diff_factor: u16,
    pub only_max_diff: bool,
    pub no_split_suggestions: bool,
    pub suggest_with_dots: bool,
    /// The case-conversion rules to use; defaults to `CaseHandling::Standard`.
    pub case_handling: CaseHandling,
}
1266
1267impl Default for AffOptions {
1268 fn default() -> Self {
1269 Self {
1270 complex_prefixes: Default::default(),
1271 fullstrip: Default::default(),
1272 checksharps: Default::default(),
1273 forbid_warn: Default::default(),
1274 only_in_compound_flag: Default::default(),
1275 circumfix_flag: Default::default(),
1276 forbidden_word_flag: Default::default(),
1277 keep_case_flag: Default::default(),
1278 need_affix_flag: Default::default(),
1279 warn_flag: Default::default(),
1280 compound_flag: Default::default(),
1281 compound_begin_flag: Default::default(),
1282 compound_middle_flag: Default::default(),
1283 compound_end_flag: Default::default(),
1284 compound_min_length: Default::default(),
1285 compound_max_word_count: Default::default(),
1286 compound_permit_flag: Default::default(),
1287 compound_forbid_flag: Default::default(),
1288 compound_root_flag: Default::default(),
1289 compound_force_uppercase_flag: Default::default(),
1290 compound_more_suffixes: Default::default(),
1291 compound_check_duplicate: Default::default(),
1292 compound_check_rep: Default::default(),
1293 compound_check_case: Default::default(),
1294 compound_check_triple: Default::default(),
1295 compound_simplified_triple: Default::default(),
1296 compound_syllable_num: Default::default(),
1297 compound_syllable_max: Default::default(),
1298 max_compound_suggestions: 3,
1299 no_suggest_flag: Default::default(),
1300 substandard_flag: Default::default(),
1301 max_ngram_suggestions: 5,
1302 max_diff_factor: 5,
1303 only_max_diff: Default::default(),
1304 no_split_suggestions: Default::default(),
1305 suggest_with_dots: Default::default(),
1306 case_handling: Default::default(),
1307 }
1308 }
1309}
1310
1311impl AffOptions {
1312 pub fn allows_compounding(&self) -> bool {
1313 self.compound_flag.is_some()
1314 || self.compound_begin_flag.is_some()
1315 || self.compound_middle_flag.is_some()
1316 || self.compound_end_flag.is_some()
1317 }
1318}
1319
1320#[cfg(test)]
1321mod test {
1322 use super::*;
1323
1324 macro_rules! flag {
1325 ( $x:expr ) => {{
1326 Flag::new($x as u16).unwrap()
1327 }};
1328 }
1329 macro_rules! flagset {
1330 () => {{
1331 FlagSet::default()
1332 }};
1333 ( $( $x:expr ),* ) => {
1334 {
1335 FlagSet::from( $crate::alloc::vec![ $( Flag::new( $x as u16 ).unwrap() ),* ] )
1336 }
1337 };
1338 }
1339
1340 #[test]
1341 fn condition_matches() {
1342 assert!("foo".parse::<Condition>().unwrap().matches("foo"));
1344
1345 assert!(!"foo".parse::<Condition>().unwrap().matches("fo"));
1347
1348 let condition = "xx[abc]x".parse::<Condition>().unwrap();
1350 assert!(condition.matches("xxax"));
1351 assert!(condition.matches("xxbx"));
1352 assert!(condition.matches("xxcx"));
1353 assert!(!condition.matches("xxdx"));
1354
1355 let condition = "xx[^abc]x".parse::<Condition>().unwrap();
1357 assert!(!condition.matches("xxax"));
1358 assert!(!condition.matches("xxbx"));
1359 assert!(!condition.matches("xxcx"));
1360 assert!(condition.matches("xxdx"));
1361 }
1362
1363 #[test]
1364 fn condition_nuspell_unit_test() {
1365 let cond = "abcd".parse::<Condition>().unwrap();
1368 assert!(cond.matches("abcd"));
1369 assert!(cond.matches("abcdXYZ"));
1370 assert!(cond.matches("abcdБВГДШ\u{ABCD}\u{10ABCD}"));
1371 assert!(!cond.matches(""));
1372 assert!(!cond.matches("abc"));
1373 assert!(!cond.matches("abcX"));
1374 assert!(!cond.matches("XYZ"));
1375 assert!(!cond.matches("АаБбВвГгШш\u{ABCD}\u{10ABCD}"));
1376
1377 let cond = "[vbn]".parse::<Condition>().unwrap();
1378 assert!(cond.matches("v"));
1379 assert!(cond.matches("vAAш"));
1380 assert!(cond.matches("b"));
1381 assert!(cond.matches("bBBш"));
1382 assert!(cond.matches("n"));
1383 assert!(cond.matches("nCCш"));
1384 assert!(!cond.matches(""));
1385 assert!(!cond.matches("Q"));
1386 assert!(!cond.matches("Qqqq"));
1387 assert!(!cond.matches("1342234"));
1388 assert!(!cond.matches("V"));
1389 assert!(!cond.matches("бвгдш"));
1390
1391 let cond = "[бш\u{1234}]".parse::<Condition>().unwrap();
1392 assert!(cond.matches("б"));
1393 assert!(cond.matches("бeT"));
1394 assert!(cond.matches("ш"));
1395 assert!(cond.matches("шок"));
1396 assert!(cond.matches("\u{1234}кош"));
1397 assert!(!cond.matches(""));
1398 assert!(!cond.matches("Q"));
1399 assert!(!cond.matches("Qqqq"));
1400 assert!(!cond.matches("пан"));
1401 assert!(!cond.matches("\u{ABCD}\u{1234}"));
1402 assert!(!cond.matches("вбгдш"));
1403
1404 let cond = "[^zш\u{1234}\u{10ABCD}]".parse::<Condition>().unwrap();
1405 assert!(!cond.matches("z"));
1406 assert!(!cond.matches("ш"));
1407 assert!(!cond.matches("\u{1234}"));
1408 assert!(!cond.matches("\u{10ABCD}"));
1409 assert!(!cond.matches("zљње"));
1410 assert!(!cond.matches("шabc"));
1411 assert!(!cond.matches("\u{1234} ytyty"));
1412 assert!(!cond.matches("\u{10ABCD} tytyty"));
1413 assert!(cond.matches("q"));
1414 assert!(cond.matches("r"));
1415 assert!(cond.matches("\u{FFFD}"));
1416 assert!(cond.matches("\u{10FFFF}"));
1417 assert!(cond.matches("qљње"));
1418 assert!(cond.matches("фabc"));
1419 assert!(cond.matches("\u{FFFD} ytyty"));
1420 assert!(cond.matches("\u{10FFFF} tytyty"));
1421
1422 let cond = "abc АБВ..[zбш\u{1234}][^zш\u{1234}\u{10ABCD}]X"
1423 .parse::<Condition>()
1424 .unwrap();
1425 assert!(cond.matches("abc АБВ \u{2345}z\u{011111}X"));
1426 assert!(cond.matches("abc АБВ\u{2345} ш\u{011112}Xопop"));
1427 assert!(!cond.matches("abc ШШШ \u{2345}z\u{011111}X"));
1428 assert!(!cond.matches("abc АБВ\u{2345} t\u{011112}Xопop"));
1429 assert!(!cond.matches("abc АБВ \u{2345}z\u{1234}X"));
1430 }
1431
1432 #[test]
1433 fn string_pair() {
1434 let pair = StrPair::new("foo", "bar");
1435 assert_eq!(pair.full_str(), "foobar");
1436
1437 let pair = StrPair::new("", "");
1438 assert_eq!(pair.full_str(), "")
1439 }
1440
1441 #[test]
1442 fn break_table_nuspell_unit_test() {
1443 let table = BreakTable::new(&[
1445 "bsd", "zxc", "asd", "^bar", "^zoo", "^abc", "car$", "yoyo$", "air$",
1446 ]);
1447
1448 let mut starts: Vec<_> = table.start_word_breaks().collect();
1449 starts.sort_unstable();
1450 assert_eq!(&["abc", "bar", "zoo"], starts.as_slice());
1451
1452 let mut middles: Vec<_> = table.middle_word_breaks().collect();
1453 middles.sort_unstable();
1454 assert_eq!(&["asd", "bsd", "zxc"], middles.as_slice());
1455
1456 let mut ends: Vec<_> = table.end_word_breaks().collect();
1457 ends.sort_unstable();
1458 assert_eq!(&["air", "car", "yoyo"], ends.as_slice());
1459 }
1460
#[test]
fn prefix_suffix_nuspell_unit_test() {
    // Prefix rule: strip a leading "qw" and add "Qwe" in its place.
    let qwe_prefix =
        Prefix::new(flag!('F'), false, Some("qw"), "Qwe", None, flagset![]).unwrap();

    let derived = qwe_prefix.to_derived("qwrty");
    assert_eq!(derived.as_str(), "Qwerty");
    let stem = qwe_prefix.to_stem("Qwerty");
    assert_eq!(stem.as_ref(), "qwrty");

    // Suffix rule: strip a trailing "ie" and add "ying" in its place.
    let ying_suffix =
        Suffix::new(flag!('F'), false, Some("ie"), "ying", None, flagset![]).unwrap();

    let derived = ying_suffix.to_derived("pie");
    assert_eq!(derived.as_str(), "pying");
    let stem = ying_suffix.to_stem("pying");
    assert_eq!(stem.as_ref(), "pie");
}
1472
#[test]
fn empty_affix_index() {
    // An index built from no affixes must yield nothing for any word.
    let no_prefixes: PrefixIndex = [].into_iter().collect();
    let mut prefix_hits = no_prefixes.affixes_of("anything");
    assert!(prefix_hits.next().is_none());

    let no_suffixes: SuffixIndex = [].into_iter().collect();
    let mut suffix_hits = no_suffixes.affixes_of("anything");
    assert!(suffix_hits.next().is_none());
}
1481
#[test]
fn affix_index_prefix_multiset_nuspell_unit_test() {
    // The `add` strings to index; "" and "as" each appear twice on purpose.
    let adds = [
        "", "a", "", "ab", "abx", "as", "asdf", "axx", "as", "bqwe", "ba", "rqwe",
    ];

    // Every prefix uses the same flag and has no strip or condition, so the
    // rules differ only in their `add` text.
    let index: PrefixIndex = adds
        .into_iter()
        .map(|add| Prefix::new(Flag::new(1).unwrap(), true, None, add, None, flagset![]).unwrap())
        .collect();

    let matching_adds: Vec<_> = index
        .affixes_of("asdfg")
        .map(|affix| affix.add.as_ref())
        .collect();

    // Only the `add` strings that "asdfg" starts with are expected,
    // duplicates included.
    assert_eq!(
        &["", "", "a", "as", "as", "asdf"],
        matching_adds.as_slice()
    );
}
1503
#[test]
fn affix_index_suffix_multiset_nuspell_unit_test() {
    // The `add` strings to index; "" and "b" each appear twice on purpose.
    let adds = [
        "", "", "a", "b", "b", "ab", "ub", "zb", "aub", "uub", "xub", "huub",
    ];

    // Every suffix uses the same flag and has no strip or condition, so the
    // rules differ only in their `add` text.
    let index: SuffixIndex = adds
        .into_iter()
        .map(|add| Suffix::new(Flag::new(1).unwrap(), true, None, add, None, flagset![]).unwrap())
        .collect();

    let matching_adds: Vec<_> = index
        .affixes_of("ahahuub")
        .map(|affix| affix.add.as_ref())
        .collect();

    // Only the `add` strings that "ahahuub" ends with are expected,
    // duplicates included.
    let expected = &["", "", "b", "b", "ub", "uub", "huub"];
    assert_eq!(expected, matching_adds.as_slice());
}
1528
#[test]
fn affix_index_en_us_suffix_example() {
    let d_flag = Flag::new('D' as u16).unwrap();

    // Four suffix rules sharing the `D` flag. Each has its own condition,
    // and only the second one strips anything ("y").
    let suffix1 = Suffix::new(d_flag, true, None, "d", Some("e"), flagset![]).unwrap();
    let suffix2 =
        Suffix::new(d_flag, true, Some("y"), "ied", Some("[^aeiou]y"), flagset![]).unwrap();
    let suffix3 = Suffix::new(d_flag, true, None, "ed", Some("[^ey]"), flagset![]).unwrap();
    let suffix4 = Suffix::new(d_flag, true, None, "ed", Some("[aeiou]y"), flagset![]).unwrap();

    // The index owns clones; the originals stay around for the comparisons.
    let index: SuffixIndex = [
        suffix1.clone(),
        suffix2.clone(),
        suffix3.clone(),
        suffix4.clone(),
    ]
    .into_iter()
    .collect();

    let word = "aced";

    // The lookup goes by `add` text only: "aced" ends in "d" and "ed" but
    // not in "ied", so the second rule is the only one left out.
    let candidates: Vec<&Suffix> = index.affixes_of(word).collect();
    assert_eq!(&[&suffix1, &suffix3, &suffix4], candidates.as_slice());

    // For each candidate: the stem it produces, and whether its condition
    // accepts that stem.
    let expectations = [
        (&suffix1, "ace", true),
        (&suffix3, "ac", true),
        (&suffix4, "ac", false),
    ];
    for (suffix, expected_stem, condition_holds) in expectations {
        let stem = suffix.to_stem(word);
        assert_eq!(&stem, expected_stem);
        assert_eq!(suffix.condition_matches(&stem), condition_holds);
    }
}
1572
1573 fn compound_rule_matches(pattern: &[CompoundRuleElement], data: &str) -> bool {
1574 let flagsets: Vec<_> = data.chars().map(|ch| flagset!(ch)).collect();
1575 let borrowed: Vec<_> = flagsets.iter().collect();
1576 super::compound_rule_matches(pattern, &borrowed)
1577 }
1578
#[test]
fn compound_rule_matches_literal() {
    let rule = parser::parse_compound_rule("abc", FlagType::default()).unwrap();
    let accepts = |flags: &str| compound_rule_matches(&rule, flags);

    // A rule without wildcards matches exactly its own flag sequence.
    assert!(accepts("abc"));

    // A missing or an extra flag is rejected.
    assert!(!accepts("ac"));
    assert!(!accepts("abcd"));
}
1588
#[test]
fn compound_rule_matches_zero_or_one() {
    let rule = parser::parse_compound_rule("ab?c", FlagType::default()).unwrap();
    let accepts = |flags: &str| compound_rule_matches(&rule, flags);

    // `b?` allows the `b` flag to be absent or present once.
    assert!(accepts("ac"));
    assert!(accepts("abc"));

    // The surrounding `a` and `c` stay mandatory, and `b` may not repeat.
    assert!(!accepts("ab"));
    assert!(!accepts("bc"));
    assert!(!accepts("abb"));
    assert!(!accepts("abbc"));
}
1601
#[test]
fn compound_rule_matches_zero_or_more() {
    let rule = parser::parse_compound_rule("ab*c", FlagType::default()).unwrap();
    let accepts = |flags: &str| compound_rule_matches(&rule, flags);

    // `b*` allows any number of `b` flags, including none.
    assert!(accepts("ac"));
    assert!(accepts("abc"));
    assert!(accepts("abbc"));
    assert!(accepts("abbbc"));

    // The final `c` is required exactly once.
    assert!(!accepts("ab"));
    assert!(!accepts("abb"));
    assert!(!accepts("abbcc"));
}
1616
#[test]
fn compound_rule_simple_regex_nuspell_unit_test() {
    // Mixes literal flags with an optional `c` and a repeatable `e`.
    let rule = parser::parse_compound_rule("abc?de*ff", FlagType::default()).unwrap();
    let accepts = |flags: &str| compound_rule_matches(&rule, flags);

    assert!(accepts("abdff"));
    assert!(accepts("abcdff"));
    assert!(accepts("abdeeff"));
    assert!(accepts("abcdeff"));

    // A third `f` and a wholly unrelated sequence are both rejected.
    assert!(!accepts("abcdeeeefff"));
    assert!(!accepts("qwerty"));
}
1630
#[test]
fn casing_conversions_nuspell_unit_test() {
    // German sample: the expected uppercase form spells `ß` as `SS`, while
    // lowercase and titlecase keep `ß`.
    let german = "grüßEN";
    let lowered = CaseHandling::default().lowercase(german);
    let uppered = CaseHandling::default().uppercase(german);
    let titled = CaseHandling::default().titlecase(german);
    assert_eq!(&lowered, "grüßen");
    assert_eq!(&uppered, "GRÜSSEN");
    assert_eq!(&titled, "Grüßen");

    // With default handling, `i` and `I` are simply each other's case pair.
    let turkish = "isTAnbulI";
    let lowered = CaseHandling::default().lowercase(turkish);
    let uppered = CaseHandling::default().uppercase(turkish);
    let titled = CaseHandling::default().titlecase(turkish);
    assert_eq!(&lowered, "istanbuli");
    assert_eq!(&uppered, "ISTANBULI");
    assert_eq!(&titled, "Istanbuli");

    // With Turkic handling, `I` lowercases to dotless `ı` and `i`
    // uppercases to dotted `İ`.
    let lowered = CaseHandling::Turkic.lowercase(turkish);
    let uppered = CaseHandling::Turkic.uppercase(turkish);
    let titled = CaseHandling::Turkic.titlecase(turkish);
    assert_eq!(&lowered, "istanbulı");
    assert_eq!(&uppered, "İSTANBULI");
    assert_eq!(&titled, "İstanbulı");
}
1647}