// spellbook/aff.rs

1pub(crate) mod parser;
2
3use crate::{
4    alloc::{
5        borrow::Cow,
6        boxed::Box,
7        string::{String, ToString},
8        vec::Vec,
9    },
10    AffixingMode, Flag, FlagSet, AT_COMPOUND_BEGIN, AT_COMPOUND_END, FULL_WORD,
11};
12
13use core::{marker::PhantomData, num::NonZeroU16, str::Chars};
14
/// Reserved flag value used to mark "hidden homonym" entries.
///
/// NOTE(review): presumably `u16::MAX` is never produced by regular flag
/// parsing so it can't collide with a dictionary flag — confirm in `parser`.
// SAFETY: `u16::MAX` is non-zero, so `new_unchecked` cannot create the
// forbidden zero value (assumes `Flag` is backed by `NonZeroU16`, imported above).
pub(crate) const HIDDEN_HOMONYM_FLAG: Flag = unsafe { Flag::new_unchecked(u16::MAX) };
/// Cap on the number of suggestions produced.
// NOTE(review): enforcement happens outside this chunk — verify at the call sites.
pub(crate) const MAX_SUGGESTIONS: usize = 16;
17
/// Evaluates to `true` when `$flag` is `Some` and the collection `$flags`
/// contains that flag; a `None` flag never matches.
macro_rules! has_flag {
    ( $flags:expr, $flag:expr ) => {{
        matches!($flag, Some(flag) if $flags.contains(&flag))
    }};
}
26
/// The representation of a flag in a `.dic` or `.aff` file.
///
/// This representation also decides how we encode flags into `Flag`. This is controlled by the
/// `FLAG` directive in a `.aff` file.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) enum FlagType {
    /// A single ascii character.
    ///
    /// This is the default representation if a `.aff` file does not specify another.
    #[default]
    Short,
    /// Two adjacent ascii characters.
    ///
    /// The french dictionary uses this. For example for some proper nouns like `Asimov/L'D'Q'`:
    /// `L'` is a flag, `D'` another, `Q'` another.
    Long,
    /// A number in the range `1..=65000`.
    ///
    /// We will approximate this to `2^16`. Numeric flags are separated by commas.
    /// For example `actionfilm/70,7,252,976` from the Danish dictionary.
    Numeric,
    /// One UTF8 character described by at most two bytes
    /// (i.e. a code point at or below U+07FF).
    Utf8,
}
51
/// A matching pattern from an affix rule: a tiny subset of regex supporting
/// only literals, the `.` wildcard and `[...]`/`[^...]` character classes.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct Condition {
    /// The input pattern.
    ///
    /// The condition string is not transformed or compiled into a different input. We'll iterate
    /// over it directly to attempt to match the pattern.
    ///
    /// This string is non-empty.
    pattern: Box<str>,
    /// The number of `char`s that the pattern describes.
    ///
    /// `Condition` is such a small subset of regex that we can tell only from a linear scan of
    /// the input how many characters we will attempt to match.
    chars: usize,
}
67
68impl Condition {
69    pub fn matches(&self, input: &str) -> bool {
70        let mut input = input.chars().peekable();
71        let mut pattern = self.pattern.chars().peekable();
72
73        loop {
74            match (pattern.next(), input.next()) {
75                // If we're at the end of both inputs or the pattern is shorter, this is a match.
76                (None, _) => return true,
77                (Some(_), None) => return false,
78                // Wildcard: skip the input character.
79                (Some('.'), Some(_)) => (),
80                // Character classes
81                (Some('['), Some(input_ch)) => {
82                    let mut found = false;
83                    let negative = pattern.next_if_eq(&'^').is_some();
84
85                    for ch in pattern.by_ref() {
86                        if ch == ']' {
87                            break;
88                        }
89
90                        if ch == input_ch {
91                            found = true;
92                        }
93                    }
94
95                    // If it's a positive character class and the character isn't a member,
96                    // this is not a match.
97                    if !negative && !found {
98                        return false;
99                    }
100                    // If it's a negative character class and the character _is_ a member,
101                    // this is not a match.
102                    if negative && found {
103                        return false;
104                    }
105                }
106                // Literals: the pattern character must equal the input character.
107                (Some(pattern_ch), Some(input_ch)) => {
108                    if pattern_ch != input_ch {
109                        return false;
110                    }
111                }
112            }
113        }
114    }
115}
116
/// Internal container type for a prefix or suffix.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct Affix<K> {
    /// The flag that words may use to reference this affix.
    pub flag: Flag,
    /// Whether the affix is compatible with the opposite affix. For example a word that has both
    /// a prefix and a suffix, both the prefix and suffix should have `crossproduct: true`.
    pub crossproduct: bool,
    /// What is stripped from the stem when the affix is applied.
    pub strip: Option<Box<str>>,
    /// What should be added when the affix is applied.
    pub add: Box<str>,
    /// Condition that the stem should be checked against to query if the affix is relevant.
    ///
    /// This is optional in Spellbook. Hunspell and Nuspell represent what we say is `None` as
    /// `"."`. It's a pattern that always matches the input since the input to `condition_matches`
    /// is never empty.
    condition: Option<Condition>,
    /// Continuation flags.
    ///
    /// These are included with the `add` in `.aff` files (separated by `/`).
    // TODO: document how they're used.
    pub flags: FlagSet,
    /// Zero-sized marker tying this affix to its kind (`Pfx` or `Sfx`) so
    /// `appending` reads the `add` text in the matching direction.
    phantom_data: PhantomData<K>,
}
142
143impl<K: AffixKind> Affix<K> {
144    pub fn new(
145        flag: Flag,
146        crossproduct: bool,
147        strip: Option<&str>,
148        add: &str,
149        condition: Option<&str>,
150        flags: FlagSet,
151    ) -> Result<Self, parser::ConditionError> {
152        let condition = condition.map(str::parse).transpose()?;
153
154        Ok(Self {
155            flag,
156            crossproduct,
157            strip: strip.map(|str| str.into()),
158            add: add.into(),
159            flags,
160            condition,
161            phantom_data: PhantomData,
162        })
163    }
164
165    pub fn appending(&self) -> K::Chars<'_> {
166        K::chars(&self.add)
167    }
168
169    #[inline]
170    pub fn is_modifying(&self) -> bool {
171        self.strip.is_some() || !self.add.is_empty()
172    }
173}
174
/// Marker type for affixes applied to the beginning of a word.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) struct Pfx;
/// Marker type for affixes applied to the end of a word.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) struct Sfx;

/// Rules for replacing characters at the beginning of a stem.
pub(crate) type Prefix = Affix<Pfx>;
/// Rules for replacing characters at the end of a stem.
pub(crate) type Suffix = Affix<Sfx>;
184
/// A helper trait that, together with `Pfx` and `Sfx`, allows generically reading either
/// characters of a `&str` forwards or backwards.
///
/// This is a textbook ["lending iterator"] which uses a generic associated type to express that
/// the lifetime of the iterator is bound only to the input word.
///
/// ["lending iterator"]: https://rust-lang.github.io/generic-associated-types-initiative/design_patterns/iterable.html
pub(crate) trait AffixKind {
    /// Iterator over a word's characters in this affix kind's matching order.
    type Chars<'a>: Iterator<Item = char>
    where
        Self: 'a;

    /// Returns `word`'s characters in matching order (see the `Pfx`/`Sfx` impls).
    fn chars(word: &str) -> Self::Chars<'_>;

    /// Whether `affix` may participate under the compounding mode `MODE`.
    // Reversed form of `affix_NOT_valid` from Nuspell.
    fn is_valid<const MODE: AffixingMode>(affix: &Affix<Self>, options: &AffOptions) -> bool
    where
        Self: Sized;
}
204
205impl AffixKind for Pfx {
206    type Chars<'a> = Chars<'a>;
207
208    fn chars(word: &str) -> Self::Chars<'_> {
209        word.chars()
210    }
211
212    fn is_valid<const MODE: AffixingMode>(prefix: &Prefix, options: &AffOptions) -> bool {
213        if MODE == FULL_WORD && has_flag!(prefix.flags, options.only_in_compound_flag) {
214            return false;
215        }
216
217        if MODE == AT_COMPOUND_END && !has_flag!(prefix.flags, options.compound_permit_flag) {
218            return false;
219        }
220
221        if MODE != FULL_WORD && has_flag!(prefix.flags, options.compound_forbid_flag) {
222            return false;
223        }
224
225        true
226    }
227}
228
229impl AffixKind for Sfx {
230    type Chars<'a> = core::iter::Rev<Chars<'a>>;
231
232    fn chars(word: &str) -> Self::Chars<'_> {
233        word.chars().rev()
234    }
235
236    fn is_valid<const MODE: AffixingMode>(suffix: &Suffix, options: &AffOptions) -> bool {
237        if MODE == FULL_WORD && has_flag!(suffix.flags, options.only_in_compound_flag) {
238            return false;
239        }
240
241        if MODE == AT_COMPOUND_BEGIN && !has_flag!(suffix.flags, options.compound_permit_flag) {
242            return false;
243        }
244
245        if MODE != FULL_WORD && has_flag!(suffix.flags, options.compound_forbid_flag) {
246            return false;
247        }
248
249        true
250    }
251}
252
253impl Prefix {
254    /// Converts a word which starts with this `Prefix` to the word's stem.
255    ///
256    /// The prefix's `add` is removed from the beginning and replaced with the `strip`.
257    ///
258    /// Nuspell calls this `to_root`.
259    ///
260    /// # Panics
261    ///
262    /// This function `expect`s that the `Prefix`'s `add` is a prefix of the input `word`.
263    pub fn to_stem<'a>(&self, word: &'a str) -> Cow<'a, str> {
264        let stripped = word
265            .strip_prefix(&*self.add)
266            .expect("to_stem should only be called when the `add` is a prefix of the word");
267
268        match &self.strip {
269            Some(strip) => {
270                let mut stem = strip.to_string();
271                stem.push_str(stripped);
272                Cow::Owned(stem)
273            }
274            None => Cow::Borrowed(stripped),
275        }
276    }
277
278    /// Converts a stem into a word starting with this `Prefix`.
279    ///
280    /// This prefix's `strip` is removed from the beginning and replaced with the `add`. This is
281    /// the inverse of `Prefix::to_stem`.
282    ///
283    /// Nuspell calls this `to_derived.`
284    ///
285    /// # Panics
286    ///
287    /// This function `expect`s that the given `word` starts with this `Prefix`'s `strip`, if this
288    /// prefix has a `strip`.
289    pub fn to_derived(&self, word: &str) -> String {
290        let stripped = match &self.strip {
291            Some(strip) => word
292                .strip_prefix(&**strip)
293                .expect("to_derived should only be called when `strip` is a prefix of the word"),
294            None => word,
295        };
296        let mut stem = self.add.to_string();
297        stem.push_str(stripped);
298        stem
299    }
300
301    pub fn condition_matches(&self, word: &str) -> bool {
302        let condition = match self.condition.as_ref() {
303            Some(condition) => condition,
304            None => return true,
305        };
306
307        // Length in bytes is greater than or equal to length in chars.
308        if word.len() < condition.chars {
309            return false;
310        }
311
312        condition.matches(word)
313    }
314}
315
316impl Suffix {
317    /// Converts a word which ends with this `Suffix` to the word's stem.
318    ///
319    /// This suffix's `add` is removed from the end and replaced with the `strip`.
320    ///
321    /// Nuspell calls this `to_root`.
322    ///
323    /// # Panics
324    ///
325    /// This function `expect`s that the `Suffix`'s `add` is a suffix of the input `word`.
326    pub fn to_stem<'a>(&self, word: &'a str) -> Cow<'a, str> {
327        let stripped = word
328            .strip_suffix(&*self.add)
329            .expect("to_stem should only be called when the `add` is a suffix of the word");
330
331        match self.strip.as_deref() {
332            Some(strip) => {
333                let mut stem = stripped.to_string();
334                stem.push_str(strip);
335                Cow::Owned(stem)
336            }
337            None => Cow::Borrowed(stripped),
338        }
339    }
340
341    /// Converts a stem into a word starting with this `Suffix`.
342    ///
343    /// This suffix's `strip` is removed from the end and replaced with the `add`. This is
344    /// the inverse of `Suffix::to_stem`.
345    ///
346    /// Nuspell calls this `to_derived.`
347    ///
348    /// # Panics
349    ///
350    /// This function `expect`s that the given `word` ends with this `Suffix`'s `strip`, if this
351    /// suffix has a `strip`.
352    pub fn to_derived(&self, word: &str) -> String {
353        let mut stem = match &self.strip {
354            Some(strip) => word
355                .strip_suffix(&**strip)
356                .expect("to_derived should only be called when `strip` is a prefix of the word"),
357            None => word,
358        }
359        .to_string();
360        stem.push_str(&self.add);
361        stem
362    }
363
364    pub fn condition_matches(&self, word: &str) -> bool {
365        let condition = match self.condition.as_ref() {
366            Some(condition) => condition,
367            None => return true,
368        };
369
370        // Length in bytes is greater than or equal to length in chars.
371        let len_bytes = word.len();
372        if len_bytes < condition.chars {
373            return false;
374        }
375
376        let (chars, bytes) = word
377            .char_indices()
378            .rev()
379            .take(condition.chars)
380            .fold((0, 0), |(chars, _bytes), (byte_index, _ch)| {
381                (chars + 1, len_bytes - byte_index)
382            });
383
384        if chars < condition.chars {
385            return false;
386        }
387        condition.matches(&word[word.len() - bytes..])
388    }
389}
390
/// Index for looking up prefixes whose `add` text starts a given word.
pub(crate) type PrefixIndex = AffixIndex<Pfx>;
/// Index for looking up suffixes whose `add` text ends a given word.
pub(crate) type SuffixIndex = AffixIndex<Sfx>;
393
/// A data structure for looking up any affixes which might match a given word.
///
/// The `AffixIndex` is one of two central data structures, along with the `WordList`. It
/// functions very similarly to a [radix tree], allowing efficient lookup of prefix or suffix
/// rules.
///
/// For example a prefix from `en_US.aff` for "re":
///
/// ```text
/// PFX A Y 1
/// PFX A   0     re         .
/// ```
///
/// That prefix strips nothing (`0`) from the beginning and adds "re" to the beginning of any
/// words it is applied to.
///
/// For prefixes, `affixes_of` returns an iterator over all of the `Prefix`es in the table which
/// have an `add` field which is a prefix of the search word.
///
/// This structure also searches from the end of the word when looking up suffixes. A suffix from
/// `en_US.aff`:
///
/// ```text
/// SFX D Y 4
/// SFX D   0     d          e
/// SFX D   y     ied        [^aeiou]y
/// SFX D   0     ed         [^ey]
/// SFX D   0     ed         [aeiou]y
/// ```
///
/// Any word in the word list with the "D" flag can try to apply these suffixes. For a word like
/// "aced," `affixes_of` would return the first, third and fourth suffixes, as `d`, `ed` and `ed`
/// are suffixes of "aced," but not the second (`ied`).
///
/// Internally this type is implemented using a sorted `Vec` of affixes - one table for prefixes
/// and one for suffixes. Iterating with `affixes_of` first emits all affixes with empty `add`
/// text. Then we look at the first character in the search string. We can constrain our search
/// to only the elements in the table that start with that character using a precomputed index
/// of characters to indices within the table. After considering the first character, we use
/// linear searches of the remaining table slice to constrain the search for each next character
/// in the search key.
///
/// [radix tree]: https://en.wikipedia.org/wiki/Radix_tree
// TODO: I originally tried a hashing-based approach using `hashbrown::raw::RawTable`. Lift that
// structure from the commit history and benchmark it against this Vec based approach.
#[derive(Debug, Clone)]
pub(crate) struct AffixIndex<C> {
    /// All affixes, sorted by their `add` ("appending") text in matching order.
    table: Box<[Affix<C>]>,
    /// The distinct first characters of non-empty `add` texts, in sorted order.
    first_char: Box<[char]>,
    /// For each entry of `first_char`, the index in `table` where affixes
    /// starting with that character begin, plus one trailing entry holding
    /// `table.len()` so that ranges can always be formed by pairs.
    prefix_idx_with_first_char: Box<[usize]>,
    /// Every continuation flag carried by any affix in the table.
    pub all_flags: FlagSet,
}
446
447impl<C: AffixKind> FromIterator<Affix<C>> for AffixIndex<C> {
448    fn from_iter<T: IntoIterator<Item = Affix<C>>>(iter: T) -> Self {
449        let table: Vec<_> = iter.into_iter().collect();
450        table.into()
451    }
452}
453
impl<C: AffixKind> From<Vec<Affix<C>>> for AffixIndex<C> {
    /// Sorts the affixes and precomputes the first-character jump table used
    /// by `AffixesIter` to narrow its search.
    fn from(mut table: Vec<Affix<C>>) -> Self {
        // Sort the table lexicographically by key. We will use this lexicographical ordering to
        // efficiently search in AffixesIter.
        table.sort_unstable_by(|a, b| a.appending().cmp(b.appending()));

        let mut first_char = Vec::new();
        let mut prefix_idx_with_first_char = Vec::new();

        // Seek through the sorted table to the first element where the key is non-empty.
        let mut first_char_idx = table.partition_point(|affix| affix.appending().next().is_none());
        while first_char_idx < table.len() {
            let ch = table[first_char_idx]
                .appending()
                .next()
                .expect("vec is sorted so empty keys are before the partition point");

            // Save the first character of the key and the index of the affix in the table that
            // starts off this character. We can use this while reading the AffixIndex to jump
            // ahead efficiently in the table.
            first_char.push(ch);
            prefix_idx_with_first_char.push(first_char_idx);

            // Advance to the first affix whose key starts with a greater character,
            // or stop if every remaining key starts with `ch`.
            match table[first_char_idx..].iter().position(|affix| {
                affix
                    .appending()
                    .next()
                    .expect("vec is sorted so empty keys are before the partition point")
                    > ch
            }) {
                Some(next_char_index) => first_char_idx += next_char_index,
                None => break,
            }
        }
        // Add an extra element to the end so that `prefix_idx_with_first_char` is always one
        // element longer than `first_char`.
        prefix_idx_with_first_char.push(table.len());

        // Gather every affix's continuation flags into one `FlagSet`.
        let flags = table
            .iter()
            .flat_map(|affix| affix.flags.iter().copied())
            .collect::<Vec<Flag>>()
            .into();

        Self {
            table: table.into(),
            first_char: first_char.into(),
            prefix_idx_with_first_char: prefix_idx_with_first_char.into(),
            all_flags: flags,
        }
    }
}
506
impl<C: AffixKind> AffixIndex<C> {
    /// Returns an iterator over every affix whose `add` text matches the
    /// beginning (for `Pfx`) or end (for `Sfx`) of `word`.
    pub fn affixes_of<'index, 'word>(
        &'index self,
        word: &'word str,
    ) -> AffixesIter<'index, 'word, C> {
        AffixesIter {
            table: &self.table,
            first_char: &self.first_char,
            prefix_idx_with_first_char: &self.prefix_idx_with_first_char,
            chars: C::chars(word),
            chars_matched: 0,
        }
    }

    /// The total number of affixes in the index.
    pub fn len(&self) -> usize {
        self.table.len()
    }

    /// Iterates over all affixes in sorted (by `add` text) order.
    pub fn iter(&self) -> core::slice::Iter<Affix<C>> {
        self.table.iter()
    }
}
529
/// An iterator over the prefixes/suffixes of a given word.
pub(crate) struct AffixesIter<'index, 'word, C: AffixKind + 'static> {
    /// The remaining slice of candidate affixes, narrowed as characters match.
    table: &'index [Affix<C>],
    /// Distinct first characters of non-empty `add` texts, in sorted order.
    first_char: &'index [char],
    /// Start index in the full table for each `first_char`, plus one trailing
    /// entry holding the table length.
    prefix_idx_with_first_char: &'index [usize],
    /// Characters of the search word, in the order dictated by `C`.
    chars: C::Chars<'word>,
    /// How many characters of the search key have been matched so far.
    chars_matched: usize,
}
538
impl<'index, C: AffixKind> Iterator for AffixesIter<'index, '_, C> {
    type Item = &'index Affix<C>;

    /// Emits the next affix whose `add` text is a prefix of the search key
    /// (read forwards for `Pfx` and backwards for `Sfx`).
    fn next(&mut self) -> Option<Self::Item> {
        // TODO: revisit this type. I suspect it's faster to just use `str::starts_with` and
        // `str::ends_with` rather than char iterators. Also using char iterators shouldn't
        // be necessary I think - we could switch to bytes instead.

        // Return all affixes that append nothing first.
        if self.chars_matched == 0 {
            if self.table.is_empty() {
                return None;
            }

            let item = &self.table[0];
            if item.appending().next().is_some() {
                // The empty portion of the table is done.
                // Scan ahead to where the first character is.
                let ch = self.chars.next()?;
                let first_char_idx = self.first_char.iter().position(|c| *c == ch)?;

                // NOTE: `prefix_idx_with_first_char` always has at least one element and is
                // always one element longer than `first_char`, so we can safely index at `0`
                // and at whatever index we get from `first_char` plus one.
                let empty_offset = self.prefix_idx_with_first_char[0];
                // Constrain the bounds of the search to affixes that share the first letter
                // of the key. Offset by the number of affixes with empty `add` that we emitted
                // previously (those were sliced off the front of `table` one at a time below,
                // so `table` is shorter than the precomputed indices account for).
                let start = self.prefix_idx_with_first_char[first_char_idx] - empty_offset;
                let end = self.prefix_idx_with_first_char[first_char_idx + 1] - empty_offset;
                self.table = &self.table[start..end];
                self.chars_matched = 1;
            } else {
                // Emit this empty-`add` affix and advance past it.
                self.table = &self.table[1..];
                return Some(item);
            }
        }

        loop {
            if self.table.is_empty() {
                return None;
            }

            // If the search key is exactly matched so far (up to the number of characters we've
            // seen), emit the item.
            let item = &self.table[0];
            if item.appending().count() == self.chars_matched {
                self.table = &self.table[1..];
                return Some(item);
            }

            // Look at the next character in the search key. Limit the search to the slice of
            // the table where the nth character for each affix matches this character of the
            // search key.
            let ch = self.chars.next()?;

            // Move `start` up to the index of the first affix that has this character in its
            // nth position.
            let char_beginning_idx = self
                .table
                .iter()
                .position(|affix| affix.appending().nth(self.chars_matched) == Some(ch))?;
            self.table = &self.table[char_beginning_idx..];

            // Move the `end` back so that the last element in the search slice is the last
            // affix that shares this character in its nth position. (The table is sorted, so
            // affixes sharing the character form one contiguous run.)
            let char_end_idx = self
                .table
                .partition_point(|affix| affix.appending().nth(self.chars_matched) == Some(ch));
            self.table = &self.table[..char_end_idx];

            self.chars_matched += 1;
        }
    }
}
614
/// A collection of patterns used to break words into smaller words.
///
/// This is internally represented with a single `table` which is partitioned into three sections:
/// one for patterns that apply at the beginning of words, one for patterns that can apply
/// anywhere in the middle of a word, and one for patterns that must apply to the end of a word.
///
// TODO: document how breaks are used and what the patterns mean.
#[derive(Debug, Clone)]
pub(crate) struct BreakTable {
    /// All break patterns with their `^`/`$` anchors stripped.
    table: Box<[Box<str>]>,
    /// Exclusive end of the start-of-word section (also where the middle section begins).
    start_word_breaks_last_idx: usize,
    // Nuspell keeps the entries partitioned in the order "start, end, middle." I've re-arranged
    // this to be "start, middle, end" since I think it's more natural.
    /// Exclusive end of the middle section; the remainder are end-of-word patterns.
    middle_word_breaks_last_idx: usize,
}
630
631impl BreakTable {
632    fn new(breaks: &[&str]) -> Self {
633        let mut start = Vec::new();
634        let mut middle = Vec::new();
635        let mut end = Vec::new();
636
637        for &b in breaks.iter() {
638            // Break patterns are parsed in a way that ensures they are non-empty.
639            assert!(!b.is_empty());
640
641            if let Some(b) = b.strip_prefix('^') {
642                start.push(b.into());
643            } else if let Some(b) = b.strip_suffix('$') {
644                end.push(b.into());
645            } else {
646                middle.push(b.into());
647            }
648        }
649
650        let mut table = start;
651        let start_word_breaks_last_idx = table.len();
652        table.append(&mut middle);
653        let middle_word_breaks_last_idx = table.len();
654        table.append(&mut end);
655
656        Self {
657            table: table.into_boxed_slice(),
658            start_word_breaks_last_idx,
659            middle_word_breaks_last_idx,
660        }
661    }
662
663    #[inline]
664    pub fn start_word_breaks(&self) -> impl Iterator<Item = &str> {
665        self.table[..self.start_word_breaks_last_idx]
666            .iter()
667            .map(AsRef::as_ref)
668    }
669
670    #[inline]
671    pub fn middle_word_breaks(&self) -> impl Iterator<Item = &str> {
672        self.table[self.start_word_breaks_last_idx..self.middle_word_breaks_last_idx]
673            .iter()
674            .map(AsRef::as_ref)
675    }
676
677    #[inline]
678    pub fn end_word_breaks(&self) -> impl Iterator<Item = &str> {
679        self.table[self.middle_word_breaks_last_idx..]
680            .iter()
681            .map(AsRef::as_ref)
682    }
683}
684
/// An ordered sequence of replacement pairs.
///
/// This is very similar to break patterns both in terms of the language used to describe them
/// in aff files and in representation. It's an ordered sequence with the replacements that only
/// apply:
///
/// 1. To entire words.
/// 2. At the beginning of words.
/// 3. At the end of words.
/// 4. Anywhere in the word.
///
/// Otherwise though it's basically a `Vec<(String, String)>`.
#[derive(Debug, Clone)]
pub(crate) struct ReplacementTable {
    /// All `(from, to)` pairs with anchors stripped, stored in the partition
    /// order listed above.
    table: Box<[(Box<str>, Box<str>)]>,
    /// Exclusive end of the whole-word section.
    whole_word_replacements_last_idx: usize,
    /// Exclusive end of the start-of-word section.
    start_word_replacements_last_idx: usize,
    /// Exclusive end of the end-of-word section; the rest apply anywhere.
    end_word_replacements_last_idx: usize,
}
704
705impl From<Vec<(&str, String)>> for ReplacementTable {
706    fn from(replacements: Vec<(&str, String)>) -> Self {
707        let mut whole = Vec::new();
708        let mut start = Vec::new();
709        let mut end = Vec::new();
710        let mut anywhere = Vec::new();
711
712        for (from, to) in replacements.into_iter() {
713            // Replacements are parsed in a way that ensures they are non-empty.
714            assert!(!from.is_empty() && !to.is_empty());
715
716            if let Some(from) = from.strip_prefix('^') {
717                if let Some(from) = from.strip_suffix('$') {
718                    // Starts with '^' and ends with '$' matches the whole word.
719                    // Seems to be rarely if ever used in practice.
720                    whole.push((from.into(), to.into()));
721                } else {
722                    // Only starts with '^' - applies to the start of a word.
723                    start.push((from.into(), to.into()));
724                }
725            } else if let Some(from) = from.strip_suffix('$') {
726                // Only ends with '$' - applies to the end of a word.
727                end.push((from.into(), to.into()));
728            } else {
729                // Doesn't have an anchor - applies anywhere in the word.
730                anywhere.push((from.into(), to.into()));
731            }
732        }
733
734        let mut table = whole;
735        let whole_word_replacements_last_idx = table.len();
736        table.append(&mut start);
737        let start_word_replacements_last_idx = table.len();
738        table.append(&mut end);
739        let end_word_replacements_last_idx = table.len();
740        table.append(&mut anywhere);
741
742        Self {
743            table: table.into_boxed_slice(),
744            whole_word_replacements_last_idx,
745            start_word_replacements_last_idx,
746            end_word_replacements_last_idx,
747        }
748    }
749}
750
751impl ReplacementTable {
752    #[inline]
753    pub fn whole_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
754        self.table[..self.whole_word_replacements_last_idx]
755            .iter()
756            .map(|(from, to)| (from.as_ref(), to.as_ref()))
757    }
758
759    #[inline]
760    pub fn start_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
761        self.table[self.whole_word_replacements_last_idx..self.start_word_replacements_last_idx]
762            .iter()
763            .map(|(from, to)| (from.as_ref(), to.as_ref()))
764    }
765
766    #[inline]
767    pub fn end_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
768        self.table[self.start_word_replacements_last_idx..self.end_word_replacements_last_idx]
769            .iter()
770            .map(|(from, to)| (from.as_ref(), to.as_ref()))
771    }
772
773    #[inline]
774    pub fn any_place_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
775        self.table[self.end_word_replacements_last_idx..]
776            .iter()
777            .map(|(from, to)| (from.as_ref(), to.as_ref()))
778    }
779
780    /// Whether the table only has whole word replacements.
781    #[inline]
782    pub fn has_only_whole_word_replacements(&self) -> bool {
783        self.whole_word_replacements_last_idx == self.table.len()
784    }
785}
786
/// Individual rules of COMPOUNDRULE patterns.
///
/// Compound rules are a very small regex-like language for describing how stems might be joined
/// in a compound. Each element might be a flag, a zero-or-one wildcard (`?`) or a zero-or-more
/// wildcard (`*`). Typically dictionaries use these to describe how to compound numbers
/// together. The [Spylls docs for `CompoundRule`](https://spylls.readthedocs.io/en/latest/hunspell/data_aff.html?highlight=compound%20rule#spylls.hunspell.data.aff.CompoundRule)
/// have an excellent explanation of a common use-case for compound rules.
///
/// # Representation
///
/// Nuspell doesn't special case `*` and `?` modifiers. It stores the entire rule as a string
/// of `char16_t` (which is also Nuspell flag type). This is quite clever because it allows
/// Nuspell to only spend two bytes per element to store the rule. `CompoundRuleElement` is
/// 4 bytes in comparison. The tradeoff is ambiguity for some `FlagType` representations and more
/// complicated matching code. If a `.aff` file used `FlagType::Numeric`, `*` would be
/// indistinguishable from 42 and `?` indistinguishable from 63. In practice this doesn't seem to
/// be a problem. Nuspell looks ahead in the rule string to find wildcards when matching which is
/// not much more work but is more complicated to understand.
///
/// We use a `[CompoundRuleElement]` in Spellbook only for clarity. Few dictionaries use
/// compound rules and those that do use them tend to use 12 or fewer entries in the table, with
/// each rule being only a few elements long.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct CompoundRuleElement {
    /// The flag a stem's flagset must contain to match this element.
    pub flag: Flag,
    /// Optional regex-like quantifier attached to the flag.
    pub modifier: Option<CompoundRuleModifier>,
}

/// Quantifiers allowed in compound rule elements, mirroring regex `?` and `*`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum CompoundRuleModifier {
    /// `?`: the flag may match zero or one stem.
    ZeroOrOne,
    /// `*`: the flag may match any number of stems.
    ZeroOrMore,
}

/// A full COMPOUNDRULE pattern: an ordered sequence of flag elements.
type CompoundRule = Box<[CompoundRuleElement]>;
822
823fn compound_rule_matches(pattern: &[CompoundRuleElement], data: &[&FlagSet]) -> bool {
824    use crate::alloc::vec;
825    use CompoundRuleModifier::*;
826
827    // TODO: try reimplementing with recursion instead of a Vec-based stack?
828    let mut stack = vec![(0, 0)];
829
830    while let Some((pattern_idx, data_idx)) = stack.pop() {
831        if pattern_idx == pattern.len() {
832            return data_idx == data.len();
833        }
834
835        // Nuspell does this a little differently. As mentioned in the `CompoundRuleElement` docs
836        // they don't preprocess the rules so the wildcards remain in the strings as `*`/`?`
837        // characters. It looks ahead (`pattern_idx + 1`) to find the modifier and when it skips
838        // over the modifier it jumps ahead by 2. We jump ahead by one below and use the modifier
839        // we parsed instead.
840
841        let flag_matches = match data.get(data_idx) {
842            Some(flagset) => flagset.contains(&pattern[pattern_idx].flag),
843            None => false,
844        };
845        match pattern[pattern_idx].modifier {
846            Some(ZeroOrOne) => {
847                // Try as if the flag didn't match (allowed since the pattern may match 0 times).
848                stack.push((pattern_idx + 1, data_idx));
849                if flag_matches {
850                    // Try as if it did match: consume the pattern token and move on.
851                    stack.push((pattern_idx + 1, data_idx + 1));
852                }
853            }
854            Some(ZeroOrMore) => {
855                // Try as if the flag didn't match (allowed since the pattern may match 0 times).
856                stack.push((pattern_idx + 1, data_idx));
857                if flag_matches {
858                    // Try as if it did match. Don't consume the pattern token because it can
859                    // match infinitely.
860                    stack.push((pattern_idx, data_idx + 1));
861                }
862            }
863            None => {
864                // Without a modifier, only continue searching if the flag matched.
865                if flag_matches {
866                    stack.push((pattern_idx + 1, data_idx + 1));
867                }
868            }
869        }
870    }
871
872    false
873}
874
/// A set of rules that can be used to detect whether constructed compounds are allowed.
///
/// Each rule is a sequence of flags where a flag may carry a `?` (zero-or-one) or `*`
/// (zero-or-more) wildcard. A candidate compound - represented as the flagsets of its
/// stems, in order - is allowed when any rule matches the whole sequence. For example
/// en_US uses the rule `n*1t` to allow compounded numbers like "10th": any number of
/// stems flagged `n`, then a stem flagged `1`, then a stem flagged `t`.
#[derive(Debug, Clone)]
pub(crate) struct CompoundRuleTable {
    rules: Box<[CompoundRule]>,
    /// The union of every flag mentioned by any rule, kept for a cheap pre-check
    /// before attempting to match rules.
    all_flags: FlagSet,
}
883
884impl From<Vec<CompoundRule>> for CompoundRuleTable {
885    fn from(rules: Vec<CompoundRule>) -> Self {
886        let all_flags: Vec<_> = rules
887            .iter()
888            .flat_map(|rule| rule.iter().map(|el| el.flag))
889            .collect();
890
891        Self {
892            rules: rules.into_boxed_slice(),
893            all_flags: all_flags.into(),
894        }
895    }
896}
897
898impl CompoundRuleTable {
899    #[inline]
900    pub fn is_empty(&self) -> bool {
901        self.rules.is_empty()
902    }
903
904    /// Checks whether the given flagset has any flags in common with flags used in any compound
905    /// rule.
906    #[inline]
907    pub fn has_any_flags(&self, flagset: &FlagSet) -> bool {
908        self.all_flags.has_intersection(flagset)
909    }
910
911    /// Checks whether any rule in the compound table matches the sequence of flagsets.
912    ///
913    /// Also see Checker's `check_compound_with_rules_impl`. This function is passed a possible
914    /// interpretation of a compound. Consider "10th" and the en_US dictionary. The checker might
915    /// split up the word to be "1/n1" (L4) and "0th/pt" (L3). So this function would be passed
916    /// `&[flagset!['n', '1'], flagset!['p', 't']]`. The matching logic in compound_rule_matches
917    /// uses `FlagSet::contains` on these flagsets to look up whether the flagset matches the
918    /// pattern. en_US defines the compounding for this as `n*1t` - `n` being a flag on any single
919    /// digit, `*` meaning as many of those as you want (including none) and then a required `1`
920    /// flag and `t` flag - which match the input. compound_rule_matches tries different
921    /// interpretations of the modifiers to check if the pattern matches.
922    pub fn any_rule_matches(&self, flagsets: &[&FlagSet]) -> bool {
923        self.rules
924            .iter()
925            .any(|rule| compound_rule_matches(rule, flagsets))
926    }
927}
928
/// Storage for two strings within the allocation of one `Box<str>`.
#[derive(Debug, Clone)]
pub(crate) struct StrPair {
    inner: Box<str>,
    /// The `.len()` of the first string: the index of the partition between the first and
    /// second string.
    partition: usize,
}

impl StrPair {
    /// Concatenates `left` and `right` into a single allocation, remembering the byte
    /// index where `left` ends.
    pub fn new(left: &str, right: &str) -> Self {
        let mut buffer = String::with_capacity(left.len() + right.len());
        buffer.push_str(left);
        buffer.push_str(right);

        Self {
            partition: left.len(),
            inner: buffer.into(),
        }
    }

    /// The concatenation of both strings.
    #[inline]
    pub fn full_str(&self) -> &str {
        &self.inner
    }

    /// The byte index at which the first string ends and the second begins.
    #[inline]
    pub fn partition(&self) -> usize {
        self.partition
    }
}
960
/// Data for one CHECKCOMPOUNDPATTERN entry.
///
/// NOTE(review): the checking logic lives in `Checker` and is not visible here;
/// presumably a compound is rejected when the boundary between two words matches this
/// pattern - confirm against `Checker`.
#[derive(Debug, Clone)]
pub(crate) struct CompoundPattern {
    /// The two character sequences of the pattern, stored in one allocation.
    /// Presumably the end-characters of the first word and begin-characters of the
    /// second word - confirm which half is which against the parser.
    pub begin_end_chars: StrPair,
    // TODO: unimplemented. See Checker::check_compound_with_pattern_replacements.
    #[allow(dead_code)]
    pub replacement: Option<Box<str>>,
    /// A flag the first word must carry for the pattern to apply, if any.
    pub first_word_flag: Option<Flag>,
    /// A flag the second word must carry for the pattern to apply, if any.
    pub second_word_flag: Option<Flag>,
    pub match_first_only_unaffixed_or_zero_affixed: bool,
}
971
/// A single `(from, to)` replacement pair of an ICONV/OCONV table.
#[derive(Debug, Clone)]
struct Conversion {
    from: Box<str>,
    to: Box<str>,
    /// ICONV/OCONV use '_' at the end of the pattern text like '$' in regex: if the match only
    /// applies at the end of the word.
    anchor_end: bool,
}

impl Conversion {
    /// Returns the byte index in `word` at which `self.from` matches, or `None` when it
    /// doesn't occur (or, for end-anchored patterns, doesn't occur as a suffix).
    fn find(&self, word: &str) -> Option<usize> {
        let pattern: &str = &self.from;

        // A word shorter (in bytes) than the pattern can never match.
        if word.len() < pattern.len() {
            None
        } else if self.anchor_end {
            if word.ends_with(pattern) {
                Some(word.len() - pattern.len())
            } else {
                None
            }
        } else {
            word.find(pattern)
        }
    }
}
995
/// The conversion table used by ICONV and OCONV rules.
///
/// This is nothing more than a sequence of `(from, to)` replacement pairs. Not many dictionaries
/// use this rule. en_US and a few others use it to replace magic apostrophes "’" with regular
/// ones. Others like french have quite a few rules to normalize similar looking and meaning
/// unicode representations of letters, for example mapping a decomposed letter (a base
/// letter followed by a combining accent) to the single precomposed code point "à".
#[derive(Debug, Clone)]
pub(crate) struct ConversionTable {
    inner: Box<[Conversion]>,
}
1006
1007impl From<Vec<(&str, &str)>> for ConversionTable {
1008    fn from(mut table: Vec<(&str, &str)>) -> Self {
1009        // Sort the table so that longer patterns come before shorter.
1010        table.sort_unstable_by_key(|(from, _to)| core::cmp::Reverse(*from));
1011
1012        Self {
1013            inner: table
1014                .into_iter()
1015                .map(|(from, to)| {
1016                    let (from, anchor_end) = if let Some(from) = from.strip_suffix('_') {
1017                        (from, true)
1018                    } else {
1019                        (from, false)
1020                    };
1021                    Conversion {
1022                        to: to.into(),
1023                        from: from.into(),
1024                        anchor_end,
1025                    }
1026                })
1027                .collect(),
1028        }
1029    }
1030}
1031
/// A match of an ICONV/OCONV pattern in a word.
///
/// The lifetime borrows from the `ConversionTable` that produced the match, not from the
/// word being converted.
#[derive(Debug)]
struct ConversionMatch<'a> {
    /// The byte index in the word at which the match starts.
    start: usize,
    /// The pattern which matched.
    from: &'a str,
    /// The text that should be used to replace the pattern.
    replacement: &'a str,
}
1043
1044impl ConversionTable {
1045    /// Finds the earliest conversion which matches the given word at the offset with the longest
1046    /// replacement length.
1047    fn find_match<'a>(&'a self, word: &str, offset: usize) -> Option<ConversionMatch<'a>> {
1048        let mut mat: Option<ConversionMatch> = None;
1049
1050        for conversion in self.inner.iter() {
1051            let start = match conversion.find(&word[offset..]) {
1052                Some(idx) => idx + offset,
1053                None => continue,
1054            };
1055
1056            // Favor the longest pattern which starts at the earliest byte.
1057            if mat.is_none()
1058                || mat.as_ref().is_some_and(|mat| {
1059                    start < mat.start
1060                        || (start == mat.start && conversion.from.len() > mat.from.len())
1061                })
1062            {
1063                mat = Some(ConversionMatch {
1064                    start,
1065                    from: &conversion.from,
1066                    replacement: &conversion.to,
1067                })
1068            }
1069        }
1070
1071        mat
1072    }
1073
1074    /// Applies any ICONV/OCONV conversions to the given word.
1075    ///
1076    /// Longer patterns are replaced before shorter ones.
1077    pub fn convert<'a>(&self, word: &'a str) -> Cow<'a, str> {
1078        let mut word = Cow::Borrowed(word);
1079
1080        let mut i = 0;
1081        while let Some(conversion) = self.find_match(&word, i) {
1082            let mut string = word.into_owned();
1083            // Adjust for finding a match within `&word[i..]`
1084            let range = conversion.start..conversion.start + conversion.from.len();
1085            string.replace_range(range, conversion.replacement);
1086            word = Cow::Owned(string);
1087            i = conversion.start + conversion.replacement.len();
1088        }
1089
1090        word
1091    }
1092}
1093
/// How casing conversions should treat i-like characters.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) enum CaseHandling {
    /// Turkish, Azerbaijani and Crimean Tartar. These locales require some extra replacement
    /// logic for i-like characters.
    Turkic,
    /// Anything else. These locales can use the standard library's case mapping functions.
    #[default]
    Standard,
}

impl CaseHandling {
    // Casing primitives.
    // The standard library covers us most of the way but there's a locale-specific casing
    // rule that applies to Turkic locales (Turkish, Azerbaijani and Crimean Tartar):
    // "i" uppercases to dotted "İ" and "I" lowercases to dotless "ı".
    // TODO: optimize. See the standard library's optimizations.

    /// Lowercases the whole word, mapping `I` to `ı` and `İ` to `i` for Turkic locales.
    pub fn lowercase(&self, word: &str) -> String {
        match self {
            Self::Standard => word.to_lowercase(),
            Self::Turkic => word.replace('I', "ı").replace('İ', "i").to_lowercase(),
        }
    }

    /// Uppercases the whole word, mapping `i` to `İ` and `ı` to `I` for Turkic locales.
    pub fn uppercase(&self, word: &str) -> String {
        match self {
            Self::Standard => word.to_uppercase(),
            Self::Turkic => word.replace('i', "İ").replace('ı', "I").to_uppercase(),
        }
    }

    /// Uppercases the first `char` of the word and lowercases the rest.
    ///
    /// Panics if `word` is empty.
    pub fn titlecase(&self, word: &str) -> String {
        let mut chars = word.chars();
        let first = chars.next().expect("non-empty input");
        let mut output = String::with_capacity(word.len());
        match (self, first) {
            (Self::Turkic, 'i') => output.push('İ'),
            (Self::Turkic, 'ı') => output.push('I'),
            _ => output.extend(first.to_uppercase()),
        }
        for ch in chars {
            match (self, ch) {
                (Self::Turkic, 'I') => output.push('ı'),
                (Self::Turkic, 'İ') => output.push('i'),
                _ => output.extend(ch.to_lowercase()),
            }
        }
        output
    }

    /// Lowercases only the first `char`, copying the rest of the word unchanged.
    ///
    /// Panics if `word` is empty.
    pub fn lower_first_char(&self, word: &str) -> String {
        let first = word.chars().next().expect("non-empty input");
        let mut output = String::with_capacity(word.len());
        match (self, first) {
            (Self::Turkic, 'I') => output.push('ı'),
            (Self::Turkic, 'İ') => output.push('i'),
            _ => output.extend(first.to_lowercase()),
        }
        // Everything after the first char is copied verbatim.
        output.push_str(&word[first.len_utf8()..]);
        output
    }

    /// Uppercases the `char` starting at byte index `idx`, copying the rest unchanged.
    ///
    /// Panics if `idx` is not a char boundary inside `word`.
    pub fn upper_char_at(&self, word: &str, idx: usize) -> String {
        let ch = word[idx..].chars().next().expect("a char at the given index");
        let mut output = String::with_capacity(word.len());
        output.push_str(&word[..idx]);
        match (self, ch) {
            (Self::Turkic, 'ı') => output.push('I'),
            (Self::Turkic, 'i') => output.push('İ'),
            _ => output.extend(ch.to_uppercase()),
        }
        output.push_str(&word[idx + ch.len_utf8()..]);
        output
    }

    /// Checks whether `left` is equal to `right` when `right` is lowercase.
    pub fn is_char_eq_lowercase(&self, left: char, right: char) -> bool {
        // The Turkic special cases don't round-trip through `to_lowercase`.
        if matches!(
            (self, left, right),
            (Self::Turkic, 'ı', 'I') | (Self::Turkic, 'i', 'İ')
        ) {
            return true;
        }

        // Only a 1:1 lowercase mapping can be equal to a single `char`.
        let mut lowered = right.to_lowercase();
        lowered.len() == 1 && lowered.next() == Some(left)
    }
}
1191
/// A group of characters and strings that are considered similar to one another for
/// suggestion purposes.
///
/// NOTE(review): stored in `AffData::similarities` under "suggestion options";
/// presumably parsed from the `MAP` directive - confirm in the parser.
#[derive(Debug, Clone)]
pub(crate) struct SimilarityGroup {
    /// The single-`char` members of the group.
    pub chars: Box<str>,
    /// The multi-character members of the group.
    pub strings: Box<[Box<str>]>,
}
1197
/// The parsed contents of a `.aff` file, organized for use by checking and suggesting,
/// plus the parsing options needed to interpret flags in the companion `.dic` file.
#[derive(Debug, Clone)]
pub(crate) struct AffData {
    // checking options
    pub prefixes: PrefixIndex,
    pub suffixes: SuffixIndex,
    pub break_table: BreakTable,
    pub compound_rules: CompoundRuleTable,
    pub compound_syllable_vowels: Box<str>,
    pub compound_patterns: Box<[CompoundPattern]>,
    /// ICONV: conversions applied to input words before checking.
    pub input_conversions: ConversionTable,
    /// OCONV: conversions applied to output words (suggestions).
    pub output_conversions: ConversionTable,
    // locale TODO
    // suggestion options
    pub replacements: ReplacementTable,
    pub similarities: Box<[SimilarityGroup]>,
    // phonetic_table: PhoneticTable,
    pub ignore_chars: Box<[char]>,
    pub keyboard_closeness: Box<str>,
    pub try_chars: Box<str>,
    pub options: AffOptions,
    // Parsing options. These are preserved so that we can re-use them in `Dictionary::add`.
    pub flag_type: FlagType,
    pub flag_aliases: Box<[FlagSet]>,
}
1222
/// The scalar directives of a `.aff` file: boolean switches, flags with special
/// meanings, and numeric limits. Which directive sets each field is decided by the
/// parser module.
#[derive(Debug, Clone, Copy)]
pub(crate) struct AffOptions {
    pub complex_prefixes: bool,
    pub fullstrip: bool,
    pub checksharps: bool,
    pub forbid_warn: bool,
    pub only_in_compound_flag: Option<Flag>,
    pub circumfix_flag: Option<Flag>,
    pub forbidden_word_flag: Option<Flag>,
    pub keep_case_flag: Option<Flag>,
    pub need_affix_flag: Option<Flag>,
    pub warn_flag: Option<Flag>,
    // compounding options
    pub compound_flag: Option<Flag>,
    pub compound_begin_flag: Option<Flag>,
    pub compound_middle_flag: Option<Flag>,
    pub compound_end_flag: Option<Flag>,
    // These `Option<NonZeroU16>`s represent counts or sizes and a zero value isn't accepted.
    // Being the same as a flag's representation is coincidence.
    pub compound_min_length: Option<NonZeroU16>,
    pub compound_max_word_count: Option<NonZeroU16>,
    pub compound_permit_flag: Option<Flag>,
    pub compound_forbid_flag: Option<Flag>,
    pub compound_root_flag: Option<Flag>,
    pub compound_force_uppercase_flag: Option<Flag>,
    pub compound_more_suffixes: bool,
    pub compound_check_duplicate: bool,
    pub compound_check_rep: bool,
    pub compound_check_case: bool,
    pub compound_check_triple: bool,
    pub compound_simplified_triple: bool,
    pub compound_syllable_num: bool,
    pub compound_syllable_max: Option<NonZeroU16>,
    pub max_compound_suggestions: u16,
    pub no_suggest_flag: Option<Flag>,
    pub substandard_flag: Option<Flag>,
    pub max_ngram_suggestions: u16,
    pub max_diff_factor: u16,
    pub only_max_diff: bool,
    pub no_split_suggestions: bool,
    pub suggest_with_dots: bool,
    pub case_handling: CaseHandling,
}
1266
1267impl Default for AffOptions {
1268    fn default() -> Self {
1269        Self {
1270            complex_prefixes: Default::default(),
1271            fullstrip: Default::default(),
1272            checksharps: Default::default(),
1273            forbid_warn: Default::default(),
1274            only_in_compound_flag: Default::default(),
1275            circumfix_flag: Default::default(),
1276            forbidden_word_flag: Default::default(),
1277            keep_case_flag: Default::default(),
1278            need_affix_flag: Default::default(),
1279            warn_flag: Default::default(),
1280            compound_flag: Default::default(),
1281            compound_begin_flag: Default::default(),
1282            compound_middle_flag: Default::default(),
1283            compound_end_flag: Default::default(),
1284            compound_min_length: Default::default(),
1285            compound_max_word_count: Default::default(),
1286            compound_permit_flag: Default::default(),
1287            compound_forbid_flag: Default::default(),
1288            compound_root_flag: Default::default(),
1289            compound_force_uppercase_flag: Default::default(),
1290            compound_more_suffixes: Default::default(),
1291            compound_check_duplicate: Default::default(),
1292            compound_check_rep: Default::default(),
1293            compound_check_case: Default::default(),
1294            compound_check_triple: Default::default(),
1295            compound_simplified_triple: Default::default(),
1296            compound_syllable_num: Default::default(),
1297            compound_syllable_max: Default::default(),
1298            max_compound_suggestions: 3,
1299            no_suggest_flag: Default::default(),
1300            substandard_flag: Default::default(),
1301            max_ngram_suggestions: 5,
1302            max_diff_factor: 5,
1303            only_max_diff: Default::default(),
1304            no_split_suggestions: Default::default(),
1305            suggest_with_dots: Default::default(),
1306            case_handling: Default::default(),
1307        }
1308    }
1309}
1310
1311impl AffOptions {
1312    pub fn allows_compounding(&self) -> bool {
1313        self.compound_flag.is_some()
1314            || self.compound_begin_flag.is_some()
1315            || self.compound_middle_flag.is_some()
1316            || self.compound_end_flag.is_some()
1317    }
1318}
1319
1320#[cfg(test)]
1321mod test {
1322    use super::*;
1323
1324    macro_rules! flag {
1325        ( $x:expr ) => {{
1326            Flag::new($x as u16).unwrap()
1327        }};
1328    }
1329    macro_rules! flagset {
1330        () => {{
1331            FlagSet::default()
1332        }};
1333        ( $( $x:expr ),* ) => {
1334            {
1335                FlagSet::from( $crate::alloc::vec![ $( Flag::new( $x as u16 ).unwrap() ),* ] )
1336            }
1337        };
1338    }
1339
1340    #[test]
1341    fn condition_matches() {
1342        // No special characters
1343        assert!("foo".parse::<Condition>().unwrap().matches("foo"));
1344
1345        // Fast lane: the input is shorter (bytes) than the number of characters in the pattern.
1346        assert!(!"foo".parse::<Condition>().unwrap().matches("fo"));
1347
1348        // Positive character class
1349        let condition = "xx[abc]x".parse::<Condition>().unwrap();
1350        assert!(condition.matches("xxax"));
1351        assert!(condition.matches("xxbx"));
1352        assert!(condition.matches("xxcx"));
1353        assert!(!condition.matches("xxdx"));
1354
1355        // Negative character class
1356        let condition = "xx[^abc]x".parse::<Condition>().unwrap();
1357        assert!(!condition.matches("xxax"));
1358        assert!(!condition.matches("xxbx"));
1359        assert!(!condition.matches("xxcx"));
1360        assert!(condition.matches("xxdx"));
1361    }
1362
1363    #[test]
1364    fn condition_nuspell_unit_test() {
1365        // Upstream: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/tests/unit_test.cxx#L167-L299>
1366        // Our structure for Condition is different so we only port the prefix tests.
1367        let cond = "abcd".parse::<Condition>().unwrap();
1368        assert!(cond.matches("abcd"));
1369        assert!(cond.matches("abcdXYZ"));
1370        assert!(cond.matches("abcdБВГДШ\u{ABCD}\u{10ABCD}"));
1371        assert!(!cond.matches(""));
1372        assert!(!cond.matches("abc"));
1373        assert!(!cond.matches("abcX"));
1374        assert!(!cond.matches("XYZ"));
1375        assert!(!cond.matches("АаБбВвГгШш\u{ABCD}\u{10ABCD}"));
1376
1377        let cond = "[vbn]".parse::<Condition>().unwrap();
1378        assert!(cond.matches("v"));
1379        assert!(cond.matches("vAAш"));
1380        assert!(cond.matches("b"));
1381        assert!(cond.matches("bBBш"));
1382        assert!(cond.matches("n"));
1383        assert!(cond.matches("nCCш"));
1384        assert!(!cond.matches(""));
1385        assert!(!cond.matches("Q"));
1386        assert!(!cond.matches("Qqqq"));
1387        assert!(!cond.matches("1342234"));
1388        assert!(!cond.matches("V"));
1389        assert!(!cond.matches("бвгдш"));
1390
1391        let cond = "[бш\u{1234}]".parse::<Condition>().unwrap();
1392        assert!(cond.matches("б"));
1393        assert!(cond.matches("бeT"));
1394        assert!(cond.matches("ш"));
1395        assert!(cond.matches("шок"));
1396        assert!(cond.matches("\u{1234}кош"));
1397        assert!(!cond.matches(""));
1398        assert!(!cond.matches("Q"));
1399        assert!(!cond.matches("Qqqq"));
1400        assert!(!cond.matches("пан"));
1401        assert!(!cond.matches("\u{ABCD}\u{1234}"));
1402        assert!(!cond.matches("вбгдш"));
1403
1404        let cond = "[^zш\u{1234}\u{10ABCD}]".parse::<Condition>().unwrap();
1405        assert!(!cond.matches("z"));
1406        assert!(!cond.matches("ш"));
1407        assert!(!cond.matches("\u{1234}"));
1408        assert!(!cond.matches("\u{10ABCD}"));
1409        assert!(!cond.matches("zљње"));
1410        assert!(!cond.matches("шabc"));
1411        assert!(!cond.matches("\u{1234} ytyty"));
1412        assert!(!cond.matches("\u{10ABCD} tytyty"));
1413        assert!(cond.matches("q"));
1414        assert!(cond.matches("r"));
1415        assert!(cond.matches("\u{FFFD}"));
1416        assert!(cond.matches("\u{10FFFF}"));
1417        assert!(cond.matches("qљње"));
1418        assert!(cond.matches("фabc"));
1419        assert!(cond.matches("\u{FFFD} ytyty"));
1420        assert!(cond.matches("\u{10FFFF} tytyty"));
1421
1422        let cond = "abc АБВ..[zбш\u{1234}][^zш\u{1234}\u{10ABCD}]X"
1423            .parse::<Condition>()
1424            .unwrap();
1425        assert!(cond.matches("abc АБВ \u{2345}z\u{011111}X"));
1426        assert!(cond.matches("abc АБВ\u{2345} ш\u{011112}Xопop"));
1427        assert!(!cond.matches("abc ШШШ \u{2345}z\u{011111}X"));
1428        assert!(!cond.matches("abc АБВ\u{2345} t\u{011112}Xопop"));
1429        assert!(!cond.matches("abc АБВ \u{2345}z\u{1234}X"));
1430    }
1431
1432    #[test]
1433    fn string_pair() {
1434        let pair = StrPair::new("foo", "bar");
1435        assert_eq!(pair.full_str(), "foobar");
1436
1437        let pair = StrPair::new("", "");
1438        assert_eq!(pair.full_str(), "")
1439    }
1440
1441    #[test]
1442    fn break_table_nuspell_unit_test() {
1443        // Upstream: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/tests/unit_test.cxx#L100-L127>
1444        let table = BreakTable::new(&[
1445            "bsd", "zxc", "asd", "^bar", "^zoo", "^abc", "car$", "yoyo$", "air$",
1446        ]);
1447
1448        let mut starts: Vec<_> = table.start_word_breaks().collect();
1449        starts.sort_unstable();
1450        assert_eq!(&["abc", "bar", "zoo"], starts.as_slice());
1451
1452        let mut middles: Vec<_> = table.middle_word_breaks().collect();
1453        middles.sort_unstable();
1454        assert_eq!(&["asd", "bsd", "zxc"], middles.as_slice());
1455
1456        let mut ends: Vec<_> = table.end_word_breaks().collect();
1457        ends.sort_unstable();
1458        assert_eq!(&["air", "car", "yoyo"], ends.as_slice());
1459    }
1460
1461    #[test]
1462    fn prefix_suffix_nuspell_unit_test() {
1463        // Upstream: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/tests/unit_test.cxx#L301-L313>
1464        let prefix = Prefix::new(flag!('F'), false, Some("qw"), "Qwe", None, flagset![]).unwrap();
1465        assert_eq!(prefix.to_derived("qwrty").as_str(), "Qwerty");
1466        assert_eq!(prefix.to_stem("Qwerty").as_ref(), "qwrty");
1467
1468        let suffix = Suffix::new(flag!('F'), false, Some("ie"), "ying", None, flagset![]).unwrap();
1469        assert_eq!(suffix.to_derived("pie").as_str(), "pying");
1470        assert_eq!(suffix.to_stem("pying").as_ref(), "pie");
1471    }
1472
1473    #[test]
1474    fn empty_affix_index() {
1475        let index: PrefixIndex = [].into_iter().collect();
1476        assert!(index.affixes_of("anything").next().is_none());
1477
1478        let index: SuffixIndex = [].into_iter().collect();
1479        assert!(index.affixes_of("anything").next().is_none());
1480    }
1481
1482    #[test]
1483    fn affix_index_prefix_multiset_nuspell_unit_test() {
1484        // Upstream: <https://github.com/nuspell/nuspell/blob/b37faff6ea630a4a1bfb22097d455224b4239f8e/tests/unit_test.cxx#L315-L329>
1485        fn prefix(add: &str) -> Prefix {
1486            Prefix::new(Flag::new(1).unwrap(), true, None, add, None, flagset![]).unwrap()
1487        }
1488
1489        let index: PrefixIndex = [
1490            "", "a", "", "ab", "abx", "as", "asdf", "axx", "as", "bqwe", "ba", "rqwe",
1491        ]
1492        .into_iter()
1493        .map(prefix)
1494        .collect();
1495
1496        let prefixes: Vec<_> = index
1497            .affixes_of("asdfg")
1498            .map(|prefix| prefix.add.as_ref())
1499            .collect();
1500
1501        assert_eq!(&["", "", "a", "as", "as", "asdf"], prefixes.as_slice());
1502    }
1503
1504    #[test]
1505    fn affix_index_suffix_multiset_nuspell_unit_test() {
1506        // Upstream: <https://github.com/nuspell/nuspell/blob/b37faff6ea630a4a1bfb22097d455224b4239f8e/tests/unit_test.cxx#L331-L345>
1507        fn suffix(add: &str) -> Suffix {
1508            Suffix::new(Flag::new(1).unwrap(), true, None, add, None, flagset![]).unwrap()
1509        }
1510
1511        let index: SuffixIndex = [
1512            "", "", "a", "b", "b", "ab", "ub", "zb", "aub", "uub", "xub", "huub",
1513        ]
1514        .into_iter()
1515        .map(suffix)
1516        .collect();
1517
1518        let suffixes: Vec<_> = index
1519            .affixes_of("ahahuub")
1520            .map(|suffix| suffix.add.as_ref())
1521            .collect();
1522
1523        assert_eq!(
1524            &["", "", "b", "b", "ub", "uub", "huub"],
1525            suffixes.as_slice()
1526        );
1527    }
1528
1529    #[test]
1530    fn affix_index_en_us_suffix_example() {
1531        // This suffix is from `en_US.aff`:
1532        //
1533        // SFX D Y 4
1534        // SFX D   0     d          e
1535        // SFX D   y     ied        [^aeiou]y
1536        // SFX D   0     ed         [^ey]
1537        // SFX D   0     ed         [aeiou]y
1538        let flag = Flag::new('D' as u16).unwrap();
1539        let suffix1 = Suffix::new(flag, true, None, "d", Some("e"), flagset![]).unwrap();
1540        let suffix2 =
1541            Suffix::new(flag, true, Some("y"), "ied", Some("[^aeiou]y"), flagset![]).unwrap();
1542        let suffix3 = Suffix::new(flag, true, None, "ed", Some("[^ey]"), flagset![]).unwrap();
1543        let suffix4 = Suffix::new(flag, true, None, "ed", Some("[aeiou]y"), flagset![]).unwrap();
1544
1545        let index: SuffixIndex = [&suffix1, &suffix2, &suffix3, &suffix4]
1546            .into_iter()
1547            .cloned()
1548            .collect();
1549
1550        // From `en_US.dic`: `ace/DSMG`. The "ace" stem can be turned into "aced" with the above
1551        // suffix rules, specifically the first rule (`suffix1`). However all of these suffixes
1552        // except `suffix2` are returned by `affixes_of`.
1553        let word = "aced";
1554        let affixes: Vec<&Suffix> = index.affixes_of(word).collect();
1555        assert_eq!(&[&suffix1, &suffix3, &suffix4], affixes.as_slice());
1556
1557        // Note: even though the condition can match, we would also need to look up the produced
1558        // stem in the word list to confirm that "aced" is a valid word.
1559
1560        let stem1 = suffix1.to_stem(word);
1561        assert_eq!(&stem1, "ace");
1562        assert!(suffix1.condition_matches(&stem1));
1563
1564        let stem3 = suffix3.to_stem(word);
1565        assert_eq!(&stem3, "ac");
1566        assert!(suffix3.condition_matches(&stem3));
1567
1568        let stem4 = suffix4.to_stem(word);
1569        assert_eq!(&stem4, "ac");
1570        assert!(!suffix4.condition_matches(&stem4));
1571    }
1572
1573    fn compound_rule_matches(pattern: &[CompoundRuleElement], data: &str) -> bool {
1574        let flagsets: Vec<_> = data.chars().map(|ch| flagset!(ch)).collect();
1575        let borrowed: Vec<_> = flagsets.iter().collect();
1576        super::compound_rule_matches(pattern, &borrowed)
1577    }
1578
1579    #[test]
1580    fn compound_rule_matches_literal() {
1581        let rule = parser::parse_compound_rule("abc", FlagType::default()).unwrap();
1582
1583        assert!(compound_rule_matches(&rule, "abc"));
1584
1585        assert!(!compound_rule_matches(&rule, "ac"));
1586        assert!(!compound_rule_matches(&rule, "abcd"));
1587    }
1588
1589    #[test]
1590    fn compound_rule_matches_zero_or_one() {
1591        let rule = parser::parse_compound_rule("ab?c", FlagType::default()).unwrap();
1592
1593        assert!(compound_rule_matches(&rule, "ac"));
1594        assert!(compound_rule_matches(&rule, "abc"));
1595
1596        assert!(!compound_rule_matches(&rule, "ab"));
1597        assert!(!compound_rule_matches(&rule, "bc"));
1598        assert!(!compound_rule_matches(&rule, "abb"));
1599        assert!(!compound_rule_matches(&rule, "abbc"));
1600    }
1601
1602    #[test]
1603    fn compound_rule_matches_zero_or_more() {
1604        let rule = parser::parse_compound_rule("ab*c", FlagType::default()).unwrap();
1605
1606        assert!(compound_rule_matches(&rule, "ac"));
1607        assert!(compound_rule_matches(&rule, "abc"));
1608        assert!(compound_rule_matches(&rule, "abbc"));
1609        assert!(compound_rule_matches(&rule, "abbbc"));
1610        // etc.
1611
1612        assert!(!compound_rule_matches(&rule, "ab"));
1613        assert!(!compound_rule_matches(&rule, "abb"));
1614        assert!(!compound_rule_matches(&rule, "abbcc"));
1615    }
1616
1617    #[test]
1618    fn compound_rule_simple_regex_nuspell_unit_test() {
1619        // Upstream: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/tests/unit_test.cxx#L384-L393>
1620        let rule = parser::parse_compound_rule("abc?de*ff", FlagType::default()).unwrap();
1621
1622        assert!(compound_rule_matches(&rule, "abdff"));
1623        assert!(compound_rule_matches(&rule, "abcdff"));
1624        assert!(compound_rule_matches(&rule, "abdeeff"));
1625        assert!(compound_rule_matches(&rule, "abcdeff"));
1626
1627        assert!(!compound_rule_matches(&rule, "abcdeeeefff"));
1628        assert!(!compound_rule_matches(&rule, "qwerty"));
1629    }
1630
1631    #[test]
1632    fn casing_conversions_nuspell_unit_test() {
1633        // Upstream: <https://github.com/nuspell/nuspell/blob/6e46eb31708003fa02796ee8dc0c9e57248ba141/tests/unit_test.cxx#L440-L448>
1634        let word = "grüßEN";
1635        assert_eq!(&CaseHandling::default().lowercase(word), "grüßen");
1636        assert_eq!(&CaseHandling::default().uppercase(word), "GRÜSSEN");
1637        assert_eq!(&CaseHandling::default().titlecase(word), "Grüßen");
1638
1639        let word = "isTAnbulI";
1640        assert_eq!(&CaseHandling::default().lowercase(word), "istanbuli");
1641        assert_eq!(&CaseHandling::default().uppercase(word), "ISTANBULI");
1642        assert_eq!(&CaseHandling::default().titlecase(word), "Istanbuli");
1643        assert_eq!(&CaseHandling::Turkic.lowercase(word), "istanbulı");
1644        assert_eq!(&CaseHandling::Turkic.uppercase(word), "İSTANBULI");
1645        assert_eq!(&CaseHandling::Turkic.titlecase(word), "İstanbulı");
1646    }
1647}