spellbook 0.4.0

A spellchecking library compatible with Hunspell dictionaries
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
// The parent module covers suggestions made by editing the input word (for example swapping two
// characters.)  This module instead covers "ngram suggestions" - a fancier and more expensive
// procedure.
//
// The basic idea of ngram suggestion is to find words in the dictionary similar to the input
// word. To do that we try to filter down the wordlist words in a multi step process.
//
// 1. Find 100 stems in the word list with the highest "ngram similarity" score to the input word.
// 2. Expand the prefixes and suffixes for those 100 stems and find the 200 expanded words with
//    the highest ngram similarity score to the input word.
// 3. Re-score the (up to) 200 best candidates based on weighted ngram similarity and other
//    bespoke metrics.
// 4. Push the most promising candidates to the `out` vec.
//
// Note that this is **very** expensive compared to regular edit based suggestions since we need
// to iterate on the word list and expand affixes.
//
// Ngram suggestions are also criticized as not very smart:
// <https://battlepenguin.com/tech/aspell-and-hunspell-a-tale-of-two-spell-checkers/>. Ngram
// suggestions are implemented for compatibility with Nuspell/Hunspell but we could consider
// adding other strategies as well, for example looking at the Aspell code.

// # Implementation note
//
// There's a theme in this module of reusing `Vec<T>` allocations rather than having functions
// return new `Vec<T>`s. This is slightly awkward but is necessary for performance. The
// `ngram_suggest` function in this module has a very hot loop iterating over every stem in the
// word list, so individual allocations add up and the performance boost from reusing allocations
// becomes noticeable.

use core::hash::BuildHasher;
use core::slice;

use crate::alloc::{collections::BinaryHeap, string::String, vec::Vec};

use crate::aff::{CaseHandling, HIDDEN_HOMONYM_FLAG, MAX_SUGGESTIONS};
use crate::{FlagSet, FULL_WORD};

use super::Suggester;

/// Evaluates to `true` when the optional `$flag` is `Some` and `$flags` contains that flag.
///
/// An unset (`None`) flag is never considered present, and `$flags` is not consulted at all in
/// that case.
macro_rules! has_flag {
    ( $flags:expr, $flag:expr ) => {
        $flag.is_some_and(|flag| $flags.contains(&flag))
    };
}

/// Pairs a payload (for example a stem and its flagset) with a score so that the standard
/// library's `BinaryHeap` - a max-heap - can be used as a min-heap.
///
/// The ordering looks only at `score` and is inverted: the value with the *lowest* score
/// compares as the greatest, so it is the one `BinaryHeap::peek`/`pop` hand back.
#[derive(Debug, PartialEq, Eq)]
struct MinScored<T: PartialEq + Eq> {
    score: isize,
    inner: T,
}

impl<T: Eq> PartialOrd for MinScored<T> {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<T: Eq> Ord for MinScored<T> {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // Comparing `other` against `self` (rather than the reverse) flips the order.
        other.score.cmp(&self.score)
    }
}

impl<S: BuildHasher> Suggester<'_, S> {
    /// Appends "ngram suggestions" for the (misspelled) `word_str` to `out`.
    ///
    /// The work happens in four steps:
    ///
    /// 1. Scan every entry of the word list and keep the (up to) 100 stems with the best ngram
    ///    score against `word_str`. Forbidden, no-suggest, only-in-compound and hidden-homonym
    ///    entries are skipped.
    /// 2. Expand the affixes of those stems and keep the (up to) 200 expanded words whose score
    ///    reaches a threshold derived from `word_str` itself.
    /// 3. Re-score those candidates with weighted ngram similarity, the longest common
    ///    subsequence and a few other measurements.
    /// 4. Push the best candidates onto `out`, bounded by `MAX_SUGGESTIONS` and the
    ///    `max_ngram_suggestions` option. Candidates which contain a suggestion already in
    ///    `out` are not pushed.
    ///
    /// # Panics
    ///
    /// Panics if `word_str` or a candidate word is longer than `u16::MAX` bytes (asserted by
    /// `CharsStr::new`).
    pub(super) fn ngram_suggest(&self, word_str: &str, out: &mut Vec<String>) {
        // First step: find 100 stems in the word list with the best ngram score.

        // `CharsStr::new` stores one index per character plus a trailing end-of-string index,
        // hence the `+ 1`.
        let mut word_buf = Vec::with_capacity(word_str.len() + 1);
        let word = CharsStr::new(word_str, &mut word_buf);
        // Overallocate so we probably don't need to reallocate in the loop:
        let mut stem_buf = Vec::with_capacity(word.len_chars() * 2);
        // Size this buffer from `stem_buf`'s *capacity*. Its length is still zero at this point
        // so `len()` would silently skip the preallocation.
        let mut lowercase_stem_buf = Vec::with_capacity(stem_buf.capacity());
        let mut roots = BinaryHeap::with_capacity(100);
        for entry @ (stem, flagset) in self.checker.words.iter() {
            if flagset.contains(&self.checker.aff.options.forbidden_word_flag)
                || has_flag!(flagset, self.checker.aff.options.no_suggest_flag)
                || has_flag!(flagset, self.checker.aff.options.only_in_compound_flag)
                || flagset.contains(&HIDDEN_HOMONYM_FLAG)
            {
                continue;
            }
            let stem = CharsStr::new(stem.as_str(), &mut stem_buf);
            let mut score =
                left_common_substring_length(&self.checker.aff.options.case_handling, word, stem)
                    as isize;

            // TODO: lowercase into buf so we can reuse this allocation? It would mean copying a
            // lot of code from the standard library unfortunately.
            let lowercase_stem = self
                .checker
                .aff
                .options
                .case_handling
                .lowercase(stem.as_str());
            let lowercase_stem = CharsStr::new(lowercase_stem.as_str(), &mut lowercase_stem_buf);
            score += ngram_similarity_longer_worse(3, word, lowercase_stem);

            let root = MinScored {
                score,
                inner: entry,
            };
            if roots.len() != 100 {
                roots.push(root);
            } else if roots.peek().is_some_and(|entry| score > entry.score) {
                // The heap has hit capacity. Drop the lowest scoring root and push this new
                // higher scored root.
                roots.pop();
                roots.push(root);
            }
        }

        // Calculate a somewhat low threshold score so that we can ignore bad suggestions in the
        // next steps. The input word is compared against copies of itself in which every fourth
        // character (starting from the second, third and fourth character respectively) is
        // replaced by '*'. The three scores are then averaged.
        let mut mangled_word = String::new();
        let mut threshold = 0isize;
        for k_byte_idx in word.char_indices().skip(1).take(3) {
            let k_byte_idx = *k_byte_idx as usize;
            mangled_word.clear();
            mangled_word.push_str(&word_str[..k_byte_idx]);
            mangled_word.extend(word_str[k_byte_idx..].chars().enumerate().map(|(i, ch)| {
                if i % 4 == 0 {
                    '*'
                } else {
                    ch
                }
            }));
            threshold += ngram_similarity_any_mismatch(word.len_chars(), word, &mangled_word);
        }

        threshold /= 3;

        // Step two: expand the affixes for these wordlist entries, gathering the 200 highest
        // scoring candidates.

        let mut expanded_list = Vec::new();
        let mut expanded_cross_affix = Vec::new();
        let mut expanded_word_buf = Vec::with_capacity(word.len_chars() * 2);
        let mut guess_words = BinaryHeap::new();

        for MinScored {
            inner: (stem, flags),
            ..
        } in roots
        {
            expanded_cross_affix.clear();
            self.expand_stem_for_ngram(
                stem.as_str(),
                flags,
                word_str,
                &mut expanded_list,
                &mut expanded_cross_affix,
            );
            // `drain` hands out the owned `String`s while keeping `expanded_list`'s allocation
            // around for the next root.
            for expanded_word in expanded_list.drain(..) {
                let mut score = left_common_substring_length(
                    &self.checker.aff.options.case_handling,
                    word,
                    CharsStr::new(&expanded_word, &mut expanded_word_buf),
                ) as isize;

                let lower_expanded_word = self
                    .checker
                    .aff
                    .options
                    .case_handling
                    .lowercase(&expanded_word);
                score +=
                    ngram_similarity_any_mismatch(word.len_chars(), word, &lower_expanded_word);

                if score < threshold {
                    continue;
                }

                let guess_word = MinScored {
                    score,
                    inner: expanded_word,
                };
                if guess_words.len() != 200 {
                    guess_words.push(guess_word);
                } else if guess_words.peek().is_some_and(|entry| score > entry.score) {
                    // At capacity: replace the lowest scoring candidate.
                    guess_words.pop();
                    guess_words.push(guess_word);
                }
            }
        }

        // Step three: rescore these up to 200 potential matches based on a weighted ngram
        // calculation and other bespoke measurements.

        // Scratchpad vector used for calculating longest common subsequences. See
        // `longest_common_subsequence_length`.
        let mut lcs_state = Vec::new();
        // Nuspell questions whether or not the heap needs to be sorted before iterating.
        // For now, they sort the heap. I think Nuspell is correct to do so because the `break`
        // below could cause a different end behavior based on whether we're iterating on a sorted
        // or unsorted vec. Note that we are sorting in descending order here. In my manual
        // testing, using `BinaryHeap::into_vec` instead produces no noticeable difference.
        let mut guess_words = guess_words.into_sorted_vec();
        let mut lower_guess_word_buf = Vec::with_capacity(word.len_chars());
        // `iter_mut` because this loop modifies the `score`.
        for MinScored {
            score,
            inner: guess_word,
        } in guess_words.iter_mut()
        {
            let lower_guess_word = self.checker.aff.options.case_handling.lowercase(guess_word);
            let lower_guess_word = CharsStr::new(&lower_guess_word, &mut lower_guess_word_buf);

            let lcs = longest_common_subsequence_length(word, lower_guess_word, &mut lcs_state);

            // Same length and every character is part of the common subsequence: the candidate
            // equals the input word apart from casing. Boost it and stop rescoring.
            if word.len_chars() == lower_guess_word.len_chars() && word.len_chars() == lcs {
                *score += 2000;
                break;
            }

            let mut ngram2 = ngram_similarity_any_mismatch_weighted(2, word, lower_guess_word);
            ngram2 += ngram_similarity_any_mismatch_weighted(2, lower_guess_word, word);
            let ngram4 = ngram_similarity_any_mismatch(4, word, lower_guess_word.as_str());

            let left_common = left_common_substring_length(
                &self.checker.aff.options.case_handling,
                word,
                lower_guess_word,
            );

            let (num_eq_chars_same_pos, eq_char_is_swapped) =
                count_eq_at_same_pos(word, lower_guess_word);

            *score = 2 * lcs as isize;
            *score -= (word.len_chars() as isize - lower_guess_word.len_chars() as isize).abs();
            *score += left_common as isize + ngram2 + ngram4;
            if num_eq_chars_same_pos != 0 {
                *score += 1;
            }
            if eq_char_is_swapped {
                *score += 10;
            }

            // Heavily penalize candidates whose weighted bigram similarity is low relative to
            // the combined length of both words. `max_diff_factor` tunes how strict this is.
            if 5 * ngram2
                < ((word.len_chars() + lower_guess_word.len_chars())
                    * (10 - self.checker.aff.options.max_diff_factor as usize))
                    as isize
            {
                *score -= 1000;
            }
        }

        // We've updated the scores (`iter_mut` above) so we need to re-sort the Vec.
        // Note that because of `MinScored<T>`'s `Ord` implementation the Vec is ordered by
        // score descending. (Normally a sort would be ascending.)
        guess_words.sort_unstable();

        // Step four: push the most promising of the candidates to `out`.

        let be_more_selective = guess_words.first().is_some_and(|guess| guess.score > 1000);
        let old_num_suggestions = out.len();
        let max_suggestions = MAX_SUGGESTIONS
            .min(old_num_suggestions + self.checker.aff.options.max_ngram_suggestions as usize);
        for MinScored {
            score,
            inner: guess_word,
        } in guess_words
        {
            // `>=` rather than `==` so that the cap also holds if `out` already held more than
            // `MAX_SUGGESTIONS` entries when this function was called.
            if out.len() >= max_suggestions {
                break;
            }
            // Note that we are iterating in descending score order, so this sets a minimum.
            if be_more_selective && score <= 1000 {
                break;
            }
            if score < -100
                && (old_num_suggestions != out.len() || self.checker.aff.options.only_max_diff)
            {
                break;
            }
            // Nuspell converts back to UTF-8 here but we've been working with UTF-8 all along.
            if out.iter().any(|sug| guess_word.contains(sug)) {
                if score < -100 {
                    break;
                } else {
                    continue;
                }
            }
            out.push(guess_word);
        }
    }

    /// Expands `stem` with the prefixes and suffixes permitted by `flags`, collecting the derived
    /// words in `expanded_list`.
    ///
    /// Both `expanded_list` and `cross_affix` are cleared first. They are passed in so that
    /// their allocations can be reused between calls. `cross_affix` is scratch space which runs
    /// parallel to the stem/suffix entries of `expanded_list`, recording whether each entry may
    /// additionally take a prefix (the suffix's `crossproduct`).
    ///
    /// `expanded_list` is filled in this order:
    ///
    /// 1. the bare `stem`, unless `flags` has the need-affix flag,
    /// 2. `stem` with each applicable suffix,
    /// 3. each cross-product suffixed word from (2) with each applicable prefix,
    /// 4. `stem` with each applicable prefix.
    ///
    /// An affix is only applied when its `add` part is empty or is found at the corresponding
    /// end of `word`. This cheaply discards expansions which can't resemble the input.
    fn expand_stem_for_ngram(
        &self,
        stem: &str,
        flags: &FlagSet,
        word: &str,
        expanded_list: &mut Vec<String>,
        cross_affix: &mut Vec<bool>,
    ) {
        expanded_list.clear();
        cross_affix.clear();

        if !has_flag!(flags, self.checker.aff.options.need_affix_flag) {
            expanded_list.push(String::from(stem));
            cross_affix.push(false);
        }

        // Without any flags no affix can apply.
        if flags.is_empty() {
            return;
        }

        // TODO: investigate collecting `all_flags` (like we do for compounds IIRC) on the
        // prefixes and suffixes tables to see if we can disqualify flagsets faster?

        for suffix in self.checker.aff.suffixes.iter() {
            // Nuspell:
            // if (!cross_valid_inner_outer(flags, suffix))
            // 	continue;
            if !flags.contains(&suffix.flag) {
                continue;
            }
            if !self.checker.is_outer_affix_valid::<_, FULL_WORD>(suffix) {
                continue;
            }
            if self.checker.is_circumfix(suffix) {
                continue;
            }
            // Nuspell has a todo here:
            // > Suffixes marked with needaffix or circumfix should not just be skipped as we can
            // > later add prefix. This is not handled in hunspell, too.
            if suffix
                .strip
                .as_ref()
                .is_some_and(|suf| !stem.ends_with(&**suf))
            {
                continue;
            }
            if !suffix.condition_matches(stem) {
                continue;
            }
            if !suffix.add.is_empty() && !word.ends_with(&*suffix.add) {
                continue;
            }

            let expanded = suffix.to_derived(stem);
            expanded_list.push(expanded);
            cross_affix.push(suffix.crossproduct);
        }

        // Expand crossproduct words - prefixes for suffix-expanded words marked with
        // `crossproduct: true`.
        //
        // The range is evaluated once, before any of the pushes below, so only the entries
        // which have a matching `cross_affix` element are visited.
        for i in 0..expanded_list.len() {
            if !cross_affix[i] {
                continue;
            }

            for prefix in self.checker.aff.prefixes.iter() {
                let suffixed_stem = &expanded_list[i];
                // if (!cross_valid_inner_outer(flags, prefix))
                // 	continue;
                if !flags.contains(&prefix.flag) {
                    continue;
                }
                if !self.checker.is_outer_affix_valid::<_, FULL_WORD>(prefix) {
                    continue;
                }
                if self.checker.is_circumfix(prefix) {
                    continue;
                }
                if prefix
                    .strip
                    .as_ref()
                    .is_some_and(|pre| !suffixed_stem.starts_with(&**pre))
                {
                    continue;
                }
                if !prefix.condition_matches(suffixed_stem) {
                    continue;
                }
                if !prefix.add.is_empty() && !word.starts_with(&*prefix.add) {
                    continue;
                }

                // Derive from the *suffixed* stem - the word that was validated just above.
                // Deriving from the bare `stem` would drop the suffix and only duplicate what
                // the prefix-only loop below produces.
                let expanded = prefix.to_derived(suffixed_stem.as_str());
                expanded_list.push(expanded);
            }
        }

        for prefix in self.checker.aff.prefixes.iter() {
            // Nuspell:
            // if (!cross_valid_inner_outer(flags, prefix))
            // 	continue;
            if !flags.contains(&prefix.flag) {
                continue;
            }
            if !self.checker.is_outer_affix_valid::<_, FULL_WORD>(prefix) {
                continue;
            }
            if self.checker.is_circumfix(prefix) {
                continue;
            }
            if prefix
                .strip
                .as_ref()
                .is_some_and(|pre| !stem.starts_with(&**pre))
            {
                continue;
            }
            if !prefix.condition_matches(stem) {
                continue;
            }
            if !prefix.add.is_empty() && !word.starts_with(&*prefix.add) {
                continue;
            }

            let expanded = prefix.to_derived(stem);
            expanded_list.push(expanded);
        }
    }
}

/// A borrowed string (`&str`) wrapper which eagerly computes `core::str::char_indices`.
///
/// With this type it is cheap both to ask for the number of Unicode characters in the string
/// (`len_chars`) and to subslice the string by _character index_.
///
/// Compare with a regular `&str`: the number of characters can be counted with
/// `string.chars().count()` - a linear operation w.r.t. the length of the string. Subslicing
/// by character indices can't be done directly on a `&str`.
///
/// # Some discussion on UTF-8 vs. UTF-32
///
/// Nuspell uses UTF-32 for the ngram similarity section of the suggester because you can iterate
/// and index on characters easily. (In UTF-32 every index is a character because in Unicode you
/// need at most 32 bits to represent any character. (Note: not the same as a grapheme cluster
/// like 🏴‍☠️).) The C++ standard library seems to be better optimized for UTF-32 operations (I
/// believe a `std::u32_string` and equivalent string views), specifically
/// `std::basic_string_view<char32_t>`'s `find` which is central to `ngram_similarity`. We now
/// use `str::contains(kgram)` equivalently and it performs much better than
/// `&[char]::starts_with` plus indexing.
///
/// Rust's UTF-8 methods are well optimized. Specifically, taking advantage of `memchr` and SIMD
/// to search `str`s. We retain a performance advantage by staying in UTF-8 and avoiding the more
/// generically (read: dumbly) implemented `Eq for &[T]` or `&[T: Eq]::starts_with`.
///
/// In Spellbook's history this module was first implemented to work with UTF-32. Switching to
/// UTF-8 yielded an impressive 25% speed up for suggesting wrong words like "exmaple" which end
/// up in the `ngram_similarity` loop.
///
/// <details><summary>What's curious about "exmaple" specifically?...</summary>
///
/// "ngram similarity" is a kind of 'layer cake' when it comes to performance. The more similar
/// the input strings, the more you have to compare. First you start at `k=1` and count how many
/// characters of `left` appear in `right`. If at least two of them do, you count how many
/// two-character (`k=2`) substrings of `left` appear in `right`. If at least two of those do
/// then you move onto layer 3 (`k=3`).
///
/// This means that when `left` contains two or more of a common letter for the language - like
/// "e" is in English - then the `ngram_similarity` function does much more work.
///
/// `ngram_similarity` relies on finding sub-`&str`s in `right`, so the faster that you can
/// determine whether a kgram of `left` appears in `right`, the faster `ngram_similarity` will be.
///
/// </details>
///
/// The original implementation eagerly collected `&str`s into `&[char]` once - paying the cost
/// of converting to UTF-32 once. This type is similar - we only run a character iterator over
/// the string once. But we can take advantage of optimized UTF-8 comparison tools in the standard
/// library.
///
/// # Lifetimes
///
/// For performance reasons this struct borrows a slab of memory from a `Vec<u16>` which is
/// hopefully instantiated many fewer times than this struct. See `CharsStr::new` docs.
#[derive(Clone, Copy)]
struct CharsStr<'s, 'i> {
    /// The wrapped string.
    inner: &'s str,
    /// The starting byte index of every character in `inner` followed by `inner.len()`.
    char_indices: &'i [u16],
}

impl<'s, 'i> CharsStr<'s, 'i> {
    /// Wraps `s`, using `slab` as the backing storage for the character index table.
    ///
    /// `slab` is cleared and refilled on every call. Accepting a caller-owned buffer is an
    /// unusual API but it lets a hot loop build a `CharsStr` per iteration without allocating
    /// each time: the buffer is allocated once and then only grows occasionally.
    ///
    /// # Panics
    ///
    /// Panics if `s` is longer than `u16::MAX` bytes.
    fn new(s: &'s str, slab: &'i mut Vec<u16>) -> Self {
        let total_bytes = s.len();
        // Every byte index is at most the byte length, so once this holds none of the `as u16`
        // casts below can truncate.
        assert!(total_bytes <= u16::MAX as usize);
        slab.clear();
        // One entry per character plus a final entry for the end of the string. With the
        // trailing entry each adjacent pair of entries is the exclusive byte range of one
        // character.
        slab.extend(
            s.char_indices()
                .map(|(byte_idx, _)| byte_idx as u16)
                .chain(core::iter::once(total_bytes as u16)),
        );

        Self {
            inner: s,
            char_indices: slab.as_slice(),
        }
    }

    /// The number of characters (not bytes) in the string.
    const fn len_chars(&self) -> usize {
        // Discount the trailing end-of-string entry.
        self.char_indices.len() - 1
    }

    /// Whether the wrapped string has no characters.
    const fn is_empty(&self) -> bool {
        // An empty string only has the trailing end-of-string entry: `&[0u16]`.
        matches!(self.char_indices, [_])
    }

    /// The wrapped string.
    const fn as_str(&self) -> &str {
        self.inner
    }

    /// Returns the substring covering the given range of _character_ (not byte) indices.
    ///
    /// # Panics
    ///
    /// Panics if either end of `char_range` is greater than `self.len_chars()`.
    fn char_slice(&self, char_range: core::ops::Range<usize>) -> &str {
        let core::ops::Range { start, end } = char_range;
        let byte_range = (self.char_indices[start] as usize)..(self.char_indices[end] as usize);
        // SAFETY: the two lookups above are bounds checked, so the caller cannot smuggle in
        // indices outside of `self.char_indices`. Every value in `self.char_indices` is a valid
        // byte index into `self.inner`.
        // The checked equivalent, `&self.inner[byte_range]`, costs a noticeable amount on a
        // flamegraph which is why the unchecked version is used.
        unsafe { self.inner.get_unchecked(byte_range) }
    }

    /// Returns the character at the given _character_ index as a one-character `&str`.
    ///
    /// # Panics
    ///
    /// Panics if `char_idx` is not less than `self.len_chars()`.
    fn char_at(&self, char_idx: usize) -> &str {
        self.char_slice(char_idx..char_idx + 1)
    }

    /// Iterates over the characters of the string, each as a one-character `&str`.
    fn char_iter(&self) -> impl Iterator<Item = &'s str> + '_ {
        let text = self.inner;
        self.char_indices.windows(2).map(move |pair| {
            // SAFETY: `slice::windows(2)` only yields slices of exactly two elements so indices
            // 0 and 1 are always in bounds.
            let (start, end) = unsafe { (*pair.get_unchecked(0), *pair.get_unchecked(1)) };
            // SAFETY: every value in `self.char_indices` is a valid byte index into
            // `self.inner`.
            unsafe { text.get_unchecked(usize::from(start)..usize::from(end)) }
        })
    }

    /// Iterates over the raw byte indices: the start of every character followed by the byte
    /// length of the string.
    fn char_indices(&self) -> slice::Iter<'_, u16> {
        self.char_indices.iter()
    }
}

/// Counts how many leading characters `left` and `right` have in common.
///
/// The first characters may differ in case - they are compared with
/// `CaseHandling::is_char_eq_lowercase` when not identical. All following characters must match
/// exactly. Returns zero if either string is empty or the first characters don't match.
fn left_common_substring_length(
    case_handling: &CaseHandling,
    left: CharsStr,
    right: CharsStr,
) -> usize {
    let mut lhs = left.as_str().chars();
    let mut rhs = right.as_str().chars();

    let (first_left, first_right) = match (lhs.next(), rhs.next()) {
        (Some(l), Some(r)) => (l, r),
        _ => return 0,
    };

    if first_left != first_right && !case_handling.is_char_eq_lowercase(first_left, first_right) {
        return 0;
    }

    match index_of_mismatch(lhs, rhs) {
        // The index is relative to the second character, so add back the first one.
        Some(idx) => idx + 1,
        // `left` ran out without a mismatch: all of it is a prefix of `right`.
        None => left.len_chars(),
    }
}

/// Finds the index of the first item of `left` which is not equal to the item of `right` at the
/// same position.
///
/// `right` running out of items counts as a mismatch. `None` means that every item of `left`
/// was matched, even if `right` has more items.
fn index_of_mismatch<T: Eq, I: Iterator<Item = T>>(mut left: I, mut right: I) -> Option<usize> {
    left.position(|l| right.next() != Some(l))
}

/// Scores the ngram similarity of `left` and `right`, penalizing a `right` which is more than
/// two characters longer than `left` by one point per extra character.
///
/// A `right` which is shorter than `left` is not penalized. Returns zero when `right` is empty.
fn ngram_similarity_longer_worse(n: usize, left: CharsStr, right: CharsStr) -> isize {
    if right.is_empty() {
        return 0;
    }
    let excess = right.len_chars() as isize - left.len_chars() as isize - 2;
    ngram_similarity(n, left, right.as_str()) - excess.max(0)
}

// Nuspell calls this `ngram_similarity_low_level`.
/// Counts the substrings of `left`, from one up to `n` characters long, which occur anywhere in
/// `right`.
///
/// Substring lengths are tried in increasing order. The search stops after the first length
/// for which fewer than two substrings were found.
fn ngram_similarity(n: usize, left: CharsStr, right: &str) -> isize {
    let len = left.len_chars();
    let mut total = 0;

    for k in 1..=n.min(len) {
        let hits = (0..=len - k)
            .filter(|&start| right.contains(left.char_slice(start..start + k)))
            .count() as isize;
        total += hits;
        if hits < 2 {
            break;
        }
    }

    total
}

/// Scores the ngram similarity of `left` and `right`, penalizing any difference in length
/// (in either direction) beyond two characters by one point per extra character.
///
/// Returns zero when `right` is empty.
fn ngram_similarity_any_mismatch(n: usize, left: CharsStr, right: &str) -> isize {
    if right.is_empty() {
        return 0;
    }
    let len_diff = (right.chars().count() as isize - left.len_chars() as isize).abs();
    let penalty = (len_diff - 2).max(0);
    ngram_similarity(n, left, right) - penalty
}

// Nuspell returns an isize.
/// Computes the length, in characters, of the longest common subsequence of `left` and `right`.
///
/// This is the usual dynamic programming table reduced to a single row. `state_buffer` holds
/// that row: it is cleared and resized on every call and is only passed in so that its
/// allocation can be reused between calls.
fn longest_common_subsequence_length(
    left: CharsStr,
    right: CharsStr,
    state_buffer: &mut Vec<usize>,
) -> usize {
    state_buffer.clear();
    state_buffer.resize(right.len_chars(), 0);

    let mut lcs = 0;
    for l in left.char_iter() {
        // Value of the previous row in the previous column.
        let mut diagonal = 0;
        // Value of the current row in the previous column.
        let mut leftward = 0;
        // `state_buffer` has exactly one cell per character of `right`.
        for (cell, r) in state_buffer.iter_mut().zip(right.char_iter()) {
            // Before being overwritten the cell still holds the previous row's value.
            let above = *cell;
            *cell = if l == r {
                diagonal + 1
            } else {
                above.max(leftward)
            };
            diagonal = above;
            leftward = *cell;
        }
        // The last cell of the row is the answer for the prefix of `left` seen so far.
        lcs = leftward;
    }

    lcs
}

/// Scores the weighted ngram similarity of `left` and `right`, penalizing any difference in
/// length (in either direction) beyond two characters by one point per extra character.
///
/// Returns zero when `right` is empty.
fn ngram_similarity_any_mismatch_weighted(n: usize, left: CharsStr, right: CharsStr) -> isize {
    if right.is_empty() {
        return 0;
    }
    let len_diff = (right.len_chars() as isize - left.len_chars() as isize).abs();
    let penalty = (len_diff - 2).max(0);
    ngram_similarity_weighted(n, left, right.as_str()) - penalty
}

/// Scores the substrings of `left`, from one up to `n` characters long, against `right`.
///
/// A substring which occurs anywhere in `right` is worth one point. A missing substring costs
/// one point, or two points if it is the first or last substring of its length in `left`.
/// Unlike `ngram_similarity`, every substring length up to `n` is always evaluated.
fn ngram_similarity_weighted(n: usize, left: CharsStr, right: &str) -> isize {
    let len = left.len_chars();
    let mut total = 0;

    for k in 1..=n.min(len) {
        let last_start = len - k;
        for start in 0..=last_start {
            total += if right.contains(left.char_slice(start..start + k)) {
                1
            } else if start == 0 || start == last_start {
                -2
            } else {
                -1
            };
        }
    }

    total
}

/// Compares `left` and `right` character by character.
///
/// Returns the number of positions at which both strings have the same character, and whether
/// the strings are equally long and differ in exactly two positions whose characters are
/// swapped with one another.
fn count_eq_at_same_pos(left: CharsStr, right: CharsStr) -> (usize, bool) {
    let matching = left
        .char_iter()
        .zip(right.char_iter())
        .filter(|(l, r)| l == r)
        .count();

    let same_len = left.len_chars() == right.len_chars();
    let shorter_len = left.len_chars().min(right.len_chars());

    let mut is_swap = false;
    // Only two characters are not equal. Check if they were swapped.
    if same_len && shorter_len - matching == 2 {
        let mut mismatches = left
            .char_iter()
            .zip(right.char_iter())
            .filter(|(l, r)| l != r);
        if let (Some((l1, r1)), Some((l2, r2))) = (mismatches.next(), mismatches.next()) {
            is_swap = l1 == r2 && r1 == l2;
        }
    }

    (matching, is_swap)
}

#[cfg(test)]
mod test {
    use super::*;

    /// `index_of_mismatch` only reports a mismatch while `left` still has items.
    #[test]
    fn index_of_mismatch_test() {
        assert_eq!(index_of_mismatch(b"abcd".iter(), b"abcd".iter()), None);
        assert_eq!(index_of_mismatch(b"abcd".iter(), b"abxy".iter()), Some(2));
        assert_eq!(index_of_mismatch(b"abcd".iter(), b"abc".iter()), Some(3));
        assert_eq!(index_of_mismatch(b"abc".iter(), b"abcd".iter()), None);
    }

    #[test]
    fn ngram_similarity_test() {
        // Rebuilding the Spellchecker:
        // > ngram(3, 'actually', 'akchualy')
        // > 11 = a, c, u, a, l, l, y, ua, al, ly, ual
        let mut left_buf = Vec::new();
        let left = CharsStr::new("actually", &mut left_buf);
        assert_eq!(ngram_similarity(3, left, "akchualy"), 11);
    }

    #[test]
    fn longest_common_subsequence_length_test() {
        let mut left_buffer = Vec::new();
        let mut right_buffer = Vec::new();
        let mut state_buffer = Vec::new();
        assert_eq!(
            longest_common_subsequence_length(
                CharsStr::new("aaa", &mut left_buffer),
                CharsStr::new("aaa", &mut right_buffer),
                &mut state_buffer
            ),
            3
        );
        assert_eq!(
            longest_common_subsequence_length(
                CharsStr::new("aaaaa", &mut left_buffer),
                CharsStr::new("bbbaa", &mut right_buffer),
                &mut state_buffer
            ),
            2
        );
        // Empty inputs have no common subsequence.
        assert_eq!(
            longest_common_subsequence_length(
                CharsStr::new("", &mut left_buffer),
                CharsStr::new("abc", &mut right_buffer),
                &mut state_buffer
            ),
            0
        );
        assert_eq!(
            longest_common_subsequence_length(
                CharsStr::new("abc", &mut left_buffer),
                CharsStr::new("", &mut right_buffer),
                &mut state_buffer
            ),
            0
        );
    }

    #[test]
    fn count_eq_at_same_pos_test() {
        let mut left_buffer = Vec::new();
        let mut right_buffer = Vec::new();
        assert_eq!(
            count_eq_at_same_pos(
                CharsStr::new("abcd", &mut left_buffer),
                CharsStr::new("abcd", &mut right_buffer),
            ),
            (4, false)
        );
        assert_eq!(
            count_eq_at_same_pos(
                CharsStr::new("abcd", &mut left_buffer),
                CharsStr::new("acbd", &mut right_buffer),
            ),
            (2, true)
        );
        // Two mismatches which are not a swap of one another.
        assert_eq!(
            count_eq_at_same_pos(
                CharsStr::new("abcd", &mut left_buffer),
                CharsStr::new("axyd", &mut right_buffer),
            ),
            (2, false)
        );
        // A swap is only reported for strings of equal length.
        assert_eq!(
            count_eq_at_same_pos(
                CharsStr::new("abc", &mut left_buffer),
                CharsStr::new("abcd", &mut right_buffer),
            ),
            (3, false)
        );
    }
}