symspell_rs 6.8.3

Spelling correction & Fuzzy search
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
// SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
//
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup
// for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
//
// SymSpell supports compound splitting / decompounding of multi-word input strings with three cases:
// 1. mistakenly inserted space into a correct word led to two incorrect terms
// 2. mistakenly omitted space between two correct words led to one incorrect combined term
// 3. multiple independent input terms with/without spelling errors

// Copyright (C) 2025 Wolf Garbe
// Version: 6.7.3
// Author: Wolf Garbe wolf.garbe@seekstorm.com
// Maintainer: Wolf Garbe wolf.garbe@seekstorm.com
// URL: https://github.com/wolfgarbe/symspell
// Description: https://seekstorm.com/blog/1000x-spelling-correction/
//
// MIT License
// Copyright (c) 2025 Wolf Garbe
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
// and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
// https://opensource.org/licenses/MIT

use ahash::{AHashMap, AHashSet};
use itertools::Itertools;
use std::cmp;
use std::cmp::Ordering;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;
use unicode_normalization::UnicodeNormalization;

#[cfg(not(all(target_feature = "aes", target_feature = "sse2")))]
use ahash::RandomState;
#[cfg(not(all(target_feature = "aes", target_feature = "sse2")))]
use std::sync::LazyLock;

#[cfg(not(all(target_feature = "aes", target_feature = "sse2")))]
pub static HASHER_32: LazyLock<RandomState> =
    LazyLock::new(|| RandomState::with_seeds(805272099, 242851902, 646123436, 591410655));

// stable hash, faster, but not available on all platforms
// https://github.com/tkaitchuck/aHash
#[inline]
#[cfg(all(target_feature = "aes", target_feature = "sse2"))]
pub(crate) fn hash32(term_bytes: &[u8]) -> u32 {
    use gxhash::gxhash32;

    gxhash32(term_bytes, 1234)
}

// unstable hash, slower, but available on all platforms
// https://github.com/ogxd/gxhash
#[inline]
#[cfg(not(all(target_feature = "aes", target_feature = "sse2")))]
pub(crate) fn hash32(term_bytes: &[u8]) -> u32 {
    HASHER_32.hash_one(term_bytes) as u32
}

use std::{cmp::min, mem};

use smallvec::SmallVec;
use smallvec::smallvec;

const VEC_SIZE: usize = 16; //32
pub type FastVec<T> = SmallVec<[T; VEC_SIZE]>;

/// Damerau-Levenshtein edit distance, like Levenshtein but allows for adjacent transpositions.
/// Optimal string alignment version (OSA): each substring can only be edited once.
/// E.g., "CA" to "ABC" has an edit distance of 2 by for Damerau-Levenshtein, but a distance of 3 when using the optimal string alignment algorithm.
/// Returns the edit distance, >= 0 representing the number of edits required to transform one string to the other,
/// or -1 if the distance is greater than the specified max_distance.
/// https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
pub fn damerau_levenshtein_osa(a: &str, b: &str, max_distance: usize) -> Option<usize> {
    let b_len = b.chars().count();

    //the edit distance can't be less than the difference of the lengths of the strings.
    //if a.chars().count().abs_diff(b_len)> max_distance {return -1;}

    // 0..=b_len behaves like 0..b_len.saturating_add(1) which could be a different size
    // this leads to significantly worse code gen when swapping the vectors below
    let mut prev_two_distances: FastVec<usize> = (0..b_len + 1).collect();
    let mut prev_distances: FastVec<usize> = (0..b_len + 1).collect();
    let mut curr_distances: FastVec<usize> = smallvec![0; b_len + 1];

    let mut prev_a_char = char::MAX;
    let mut prev_b_char = char::MAX;

    for (i, a_char) in a.chars().enumerate() {
        curr_distances[0] = i + 1;

        for (j, b_char) in b.chars().enumerate() {
            let cost = usize::from(a_char != b_char);
            curr_distances[j + 1] = min(
                curr_distances[j] + 1,
                min(prev_distances[j + 1] + 1, prev_distances[j] + cost),
            );
            if i > 0 && j > 0 && a_char != b_char && a_char == prev_b_char && b_char == prev_a_char
            {
                curr_distances[j + 1] = min(curr_distances[j + 1], prev_two_distances[j - 1] + 1);
            }

            prev_b_char = b_char;
        }

        mem::swap(&mut prev_two_distances, &mut prev_distances);
        mem::swap(&mut prev_distances, &mut curr_distances);
        prev_a_char = a_char;
    }

    if prev_distances[b_len] <= max_distance {
        Some(prev_distances[b_len])
    } else {
        None
    }
}

/// Normalize ligatures: "scientific" "fields" "final"
pub fn unicode_normalization_form_kc(input: &str) -> String {
    input
        .nfkc() // Apply Unicode Normalization Form KC
        .collect::<String>() // Collect normalized chars into a String
}

/// Transfer the letter case char-wise from source to target string.
pub fn transfer_case(source: &str, target: &str) -> String {
    // source = "HeLLo WoRLd!";
    // target = "rustacean community!";
    // result = "RuSTacEaN community!";

    //shortcut: if input and suggestion are identical or case-insensitive identical, no transfer required
    if source == target || source.eq_ignore_ascii_case(target) {
        return source.to_string();
    }

    let mut result = String::new();

    // iterate over both strings using zip_longest from itertools
    use itertools::EitherOrBoth;
    use itertools::Itertools;
    let mut last_upper = false;

    for pair in source.chars().zip_longest(target.chars()) {
        match pair {
            // both characters exist
            EitherOrBoth::Both(s, t) => {
                if s.is_uppercase() {
                    //don't memorize letter case for first character
                    if !result.is_empty() {
                        last_upper = true;
                    }
                    result.push_str(&t.to_string().to_uppercase());
                }
                /*
                // we don't need to lowercase, because dictionary words are already lowercased
                else if s.is_lowercase() {
                    result.push_str(&t.to_string().to_lowercase());
                    result.push(t);
                }
                */
                // don't transfer lower case of whitespace in source to non-whitespace char in target
                else if s.is_whitespace() && !t.is_whitespace() && last_upper {
                    result.push_str(&t.to_string().to_uppercase());
                } else {
                    last_upper = false;
                    result.push(t);
                }
            }
            // only the source has characters left — just append them as-is
            EitherOrBoth::Left(_) => (),
            // only the target has characters left — append unchanged
            EitherOrBoth::Right(t) => {
                //use memorizeed case from last char for exceeding chars
                if last_upper {
                    result.push_str(&t.to_string().to_uppercase());
                } else {
                    result.push(t);
                }
            }
        }
    }
    result
}

/// Parse a string into words, splitting at non-alphanumeric characters, except for underscore and apostrophes.
pub fn parse_words(text: &str) -> Vec<String> {
    let mut non_unique_terms_line: Vec<String> = Vec::with_capacity(text.len() << 3);
    let text_normalized = text.to_lowercase();
    let mut start = false;
    let mut start_pos = 0;

    for char in text_normalized.char_indices() {
        start = match char.1 {
            //start of term
            token if token.is_alphanumeric() => {
                //token if regex_syntax::is_word_character(token) => {
                if !start {
                    start_pos = char.0;
                }
                true
            }

            // allows underscore and apostrophes as part of the word
            '_' | '\'' | '' => true,

            //end of term
            _ => {
                if start {
                    non_unique_terms_line.push(text_normalized[start_pos..char.0].to_string());
                }
                false
            }
        };
    }

    if start {
        non_unique_terms_line.push(text_normalized[start_pos..text_normalized.len()].to_string());
    }

    non_unique_terms_line
}

fn len(s: &str) -> usize {
    s.chars().count()
}

fn remove(s: &str, index: usize) -> String {
    s.chars()
        .enumerate()
        .filter(|(ii, _)| ii != &index)
        .map(|(_, ch)| ch)
        .collect()
}

fn slice(s: &str, start: usize, end: usize) -> String {
    s.chars().skip(start).take(end - start).collect()
}

fn suffix(s: &str, start: usize) -> String {
    s.chars().skip(start).collect::<String>()
}

fn at(s: &str, i: isize) -> Option<char> {
    if i < 0 || i >= s.len() as isize {
        return None;
    }

    s.chars().nth(i as usize)
}

#[derive(Debug, Clone)]
/// Result of word segmentation.
pub struct Composition {
    /// the word segmented and spelling corrected string,
    pub segmented_string: String,
    /// the Edit distance sum between input string and corrected string,
    pub distance_sum: usize,
    /// the Sum of word occurence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
    pub prob_log_sum: f64,
}

impl Composition {
    fn empty() -> Self {
        Self {
            segmented_string: "".to_string(),
            distance_sum: 0,
            prob_log_sum: 0.0,
        }
    }
}

#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
/// Suggested correct spelling for a given input word.
pub struct Suggestion {
    /// The suggested correctly spelled word.
    pub term: String,
    /// Edit distance between searched for word and suggestion.
    pub distance: usize,
    /// Frequency of suggestion in the dictionary (a measure of how common the word is).
    pub count: usize,
}

impl Suggestion {
    fn empty() -> Suggestion {
        Suggestion {
            term: "".to_string(),
            distance: 0,
            count: 0,
        }
    }

    fn new(term: impl Into<String>, distance: usize, count: usize) -> Suggestion {
        Suggestion {
            term: term.into(),
            distance,
            count,
        }
    }
}

// Order by distance ascending, then by frequency count descending
impl Ord for Suggestion {
    fn cmp(&self, other: &Suggestion) -> Ordering {
        let distance_cmp = self.distance.cmp(&other.distance);
        if distance_cmp == Ordering::Equal {
            return other.count.cmp(&self.count);
        }
        distance_cmp
    }
}

impl PartialOrd for Suggestion {
    fn partial_cmp(&self, other: &Suggestion) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for Suggestion {
    fn eq(&self, other: &Suggestion) -> bool {
        self.distance == other.distance && self.count == other.count
    }
}
impl Eq for Suggestion {}

#[derive(Eq, PartialEq, Debug)]
/// Controls the closeness/quantity of returned spelling suggestions.
pub enum Verbosity {
    /// Top suggestion with the highest term frequency of the suggestions of smallest edit distance found.
    Top,
    /// All suggestions of smallest edit distance found, suggestions ordered by term frequency.
    Closest,
    /// All suggestions within maxEditDistance, suggestions ordered by edit distance, then by term frequency (slower, no early termination)
    All,
}

#[derive(PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
/// SymSpell spell checker and corrector.
pub struct SymSpell {
    /// Maximum edit distance for dictionary precalculation.
    max_dictionary_edit_distance: usize,
    /// Term length thresholds for each edit distance
    ///   None: max_dictionary_edit_distance for all terms lengths
    ///   Some([4].into()): max_dictionary_edit_distance for all terms lengths >= 4,
    ///   Some([2,8].into()): max_dictionary_edit_distance for all terms lengths >=2, max_dictionary_edit_distance +1 for all terms for lengths>=8
    ///   ⚠️ The resulting maximum edit distance defined in lookup() for each term length (max_edit_distance + term_length_threshold) must be
    ///   <= the corresponding maximum edit distance defined for each term length in SymSpell::new() used for creation of dictionary structures.
    term_length_threshold: Option<Vec<usize>>,
    /// The length of word prefixes, from which deletes are generated. (5..7).
    prefix_length: usize,
    /// The minimum frequency count for dictionary words to be considered a valid for spelling correction.
    count_threshold: usize,
    /// Number of all words in the corpus used to generate the frequency dictionary
    /// this is used to calculate the word occurrence probability p from word counts c : p=c/N
    /// N equals the sum of all counts c in the dictionary only if the dictionary is complete, but not if the dictionary is truncated or filtered
    corpus_word_count: usize,
    /// Maximum dictionary term length
    max_dictionary_term_length: usize,
    /// Dictionary that contains a mapping of lists of suggested correction words to the hashCodes
    /// of the original words and the deletes derived from them. Collisions of hashCodes is tolerated,
    /// because suggestions are ultimately verified via an edit distance function.
    /// A list of suggestions might have a single suggestion, or multiple suggestions.
    deletes: AHashMap<u32, Vec<Box<str>>>,
    // Dictionary of unique correct spelling words, and the frequency count for each word.
    words: AHashMap<Box<str>, usize>,
    /// Bigrams optionally used for improved correction quality in lookup_coompound
    bigrams: AHashMap<Box<str>, usize>,
    /// Minimum bigram count in the bigram dictionary
    bigram_min_count: usize,
}

// inexpensive and language independent: only deletes, no transposes + replaces + inserts
// replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters)
fn edits(
    word: &str,
    edit_distance: usize,
    max_term_edit_distance: usize,
    delete_words: &mut AHashSet<String>,
) {
    let edit_distance = edit_distance + 1;
    let word_len = len(word);

    if word_len > 1 {
        for i in 0..word_len {
            let delete = remove(word, i);

            if !delete_words.contains(&delete) {
                delete_words.insert(delete.clone());

                //max_term_edit_distance dependent on term_length_threshold
                if edit_distance < max_term_edit_distance {
                    edits(&delete, edit_distance, max_term_edit_distance, delete_words);
                }
            }
        }
    }
}

impl SymSpell {
    /// Creates a new SymSpell instance.
    ///  
    /// # Arguments
    ///
    /// * `max_dictionary_edit_distance` - Maximum edit distance for dictionary precalculation.
    /// * `term_length_threshold` - Term length thresholds for each edit distance.
    ///   None: max_dictionary_edit_distance for all terms lengths
    ///   Some([4].into()): max_dictionary_edit_distance for all terms lengths >= 4,
    ///   Some([2,8].into()): max_dictionary_edit_distance for all terms lengths >=2, max_dictionary_edit_distance +1 for all terms for lengths>=8
    ///   ⚠️ The resulting maximum edit distance defined in lookup() for each term length (max_edit_distance + term_length_threshold) must be
    ///   <= the corresponding maximum edit distance defined for each term length in SymSpell::new() used for creation of dictionary structures.
    /// * `prefix_length` - The length of word prefixes, from which deletes are generated. (5..7).
    /// * `count_threshold` - The minimum frequency count for dictionary words to be considered a valid for spelling correction.
    pub fn new(
        max_dictionary_edit_distance: usize,
        term_length_threshold: Option<Vec<usize>>,
        prefix_length: usize,
        count_threshold: usize,
    ) -> Self {
        Self {
            max_dictionary_edit_distance, //2
            term_length_threshold,        //vec![]
            prefix_length,                //7
            count_threshold,              //1
            corpus_word_count: 1_024_908_267_229,
            max_dictionary_term_length: 0,
            deletes: AHashMap::new(),
            words: AHashMap::new(),
            bigrams: AHashMap::new(),
            bigram_min_count: usize::MAX,
        }
    }

    /// Get the number of entries in the dictionary.
    pub fn get_dictionary_size(&self) -> usize {
        self.words.len()
    }

    /// Write the dictionary to a CSV file.
    /// Useful when the dictionary was incrementally built/updated with create_dictionary_entry.
    /// Entries are sorted by frequency count descending.
    ///
    /// # Arguments
    ///
    /// * `path` - The path+filename of the file.
    /// * `separator` - Separator between word and frequency
    pub fn save_dictionary(
        &self,
        path: &Path,
        separator: &str,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let file = File::create(path)?;
        let mut writer = BufWriter::new(file);

        for (entry, count) in self
            .words
            .iter()
            .sorted_unstable_by(|a, b| Ord::cmp(&b.1, &a.1))
        {
            writeln!(writer, "{}{}{}", entry, separator, count)?;
        }
        writer.flush()?;

        Ok(())
    }

    /// Load multiple dictionary entries from a file of word/frequency count pairs.
    ///
    /// # Arguments
    ///
    /// * `corpus` - The path+filename of the file.
    /// * `term_index` - The column position of the word.
    /// * `count_index` - The column position of the frequency count.
    /// * `separator` - Separator between word and frequency
    pub fn load_dictionary(
        &mut self,
        path: &Path,
        term_index: usize,
        count_index: usize,
        separator: &str,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let file = File::open(path)?;
        let sr = BufReader::new(file);

        for line in sr.lines() {
            let line_str = line?;
            self.load_dictionary_line(&line_str, term_index, count_index, separator);
        }
        Ok(())
    }

    /// Load single dictionary entry from word/frequency count pair.
    ///
    /// # Arguments
    ///
    /// * `line` - word/frequency pair.
    /// * `term_index` - The column position of the word.
    /// * `count_index` - The column position of the frequency count.
    /// * `separator` - Separator between word and frequency
    pub fn load_dictionary_line(
        &mut self,
        line: &str,
        term_index: usize,
        count_index: usize,
        separator: &str,
    ) -> bool {
        let line_parts: Vec<&str> = line.split(separator).collect();
        if line_parts.len() >= 2 {
            // let key = unidecode(line_parts[term_index as usize]);
            let key = line_parts[term_index].to_string();
            let count = line_parts[count_index].parse::<usize>().unwrap();

            self.create_dictionary_entry(key, count);
        }
        true
    }

    /// Load multiple bigram entries from a file of bigram/frequency count pairs.
    /// Only used in lookup_compound for improved compound splitting/merging/correction quality.
    ///
    /// # Arguments
    ///
    /// * `corpus` - The path+filename of the file.
    /// * `term_index` - The column position of the word.
    /// * `count_index` - The column position of the frequency count.
    /// * `separator` - Separator between word and frequency
    pub fn load_bigram_dictionary(
        &mut self,
        path: &Path,
        term_index: usize,
        count_index: usize,
        separator: &str,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let file = File::open(path)?;
        let sr = BufReader::new(file);
        for line in sr.lines() {
            let line_str = line?;
            self.load_bigram_dictionary_line(&line_str, term_index, count_index, separator);
        }
        Ok(())
    }

    /// Load single dictionary entry from bigram/frequency count pair.
    ///
    /// # Arguments
    ///
    /// * `line` - bigram/frequency pair.
    /// * `term_index` - The column position of the word.
    /// * `count_index` - The column position of the frequency count.
    /// * `separator` - Separator between word and frequency
    pub fn load_bigram_dictionary_line(
        &mut self,
        line: &str,
        term_index: usize,
        count_index: usize,
        separator: &str,
    ) -> bool {
        let line_parts: Vec<&str> = line.split(separator).collect();
        let line_parts_len = if separator == " " { 3 } else { 2 };
        if line_parts.len() >= line_parts_len {
            let key = if separator == " " {
                [line_parts[term_index], line_parts[term_index + 1]].join(" ")
            } else {
                line_parts[term_index].to_string()
            };
            let count = line_parts[count_index].parse::<usize>().unwrap();
            self.bigrams.insert(key.into_boxed_str(), count);
            if count < self.bigram_min_count {
                self.bigram_min_count = count;
            }
        }
        true
    }

    /// Find suggested spellings for a given input word, using the maximum
    /// edit distance specified during construction of the SymSpell dictionary.
    /// Returned suggestions are sorted by distance ascending, then by frequency count descending.
    ///
    /// # Arguments
    ///
    /// * `input` - The word being spell checked. Upper/lower case allowed.
    /// * `verbosity` - The value controlling the quantity/closeness of the retuned suggestions.
    /// * `max_edit_distance` - The maximum edit distance between input and suggested words.
    /// * `term_length_threshold` - Term length thresholds for each edit distance.
    ///   None: max_dictionary_edit_distance for all terms lengths
    ///   Some([4].into()): max_dictionary_edit_distance for all terms lengths >= 4,
    ///   Some([2,8].into()): max_dictionary_edit_distance for all terms lengths >=2, max_dictionary_edit_distance +1 for all terms for lengths>=8
    ///   ⚠️ The resulting maximum edit distance defined in lookup() for each term length (max_edit_distance + term_length_threshold) must be
    ///   <= the corresponding maximum edit distance defined for each term length in SymSpell::new() used for creation of dictionary structures.
    /// * `max_results` - Optional parameter to limit the number of suggestions returned.
    /// * `preserve_case` - Whether to preserve the letter case from input to suggestion.
    ///
    /// # Examples
    ///
    /// ```
    /// use symspell_rs::{SymSpell, Verbosity};
    /// use std::path::Path;
    ///
    /// let mut symspell: SymSpell = SymSpell::new(2,None, 7, 1);
    /// symspell.load_dictionary(Path::new("data/frequency_dictionary_en_82_765.txt"), 0, 1, " ");
    /// symspell.lookup("whatver", Verbosity::Top, 2,&None,None,false);
    /// ```
    pub fn lookup(
        &self,
        input: &str,
        verbosity: Verbosity,
        max_edit_distance: usize,
        term_length_threshold: &Option<Vec<usize>>,
        max_results: Option<usize>,
        preserve_case: bool,
    ) -> Vec<Suggestion> {
        //todo: validate term_length_threshold
        if max_edit_distance + term_length_threshold.as_ref().map_or(0, |x| x.len())
            > self.max_dictionary_edit_distance
                + self.term_length_threshold.as_ref().map_or(0, |x| x.len())
        {
            println!("max_edit_distance is bigger than max_dictionary_edit_distance");
        }

        //len(input)
        //check it for all term lengths vs check it only for current length? eigentlich reicht current, aber dann taucht der fehler später wieder auf

        //max_term_edit_distance dependent on term_length_threshold
        let max_term_edit_distance =
            term_length_threshold
                .as_ref()
                .map_or(max_edit_distance, |term_length_threshold| {
                    let term_len = len(input);
                    let mut max_term_edit_distance = 0;
                    for (i, threshold) in term_length_threshold.iter().enumerate() {
                        if term_len >= *threshold {
                            max_term_edit_distance = max_edit_distance + i
                        } else {
                            break;
                        }
                    }
                    max_term_edit_distance
                });

        let mut suggestions: Vec<Suggestion> = Vec::new();

        let input_lower_case = input.to_lowercase();
        let input_original_case = if preserve_case {
            input
        } else {
            &input_lower_case
        };
        let input = input_lower_case.as_str();

        let input_len = len(input);
        // early termination - word is too big to possibly match any words
        if input_len as isize - max_term_edit_distance as isize
            > self.max_dictionary_term_length as isize
        {
            return suggestions;
        }

        let mut hashset1: AHashSet<String> = AHashSet::new();
        let mut hashset2: AHashSet<String> = AHashSet::new();

        if self.words.contains_key(input) {
            let suggestion_count = self.words[input];
            suggestions.push(Suggestion::new(input_original_case, 0, suggestion_count));
            // early termination - return exact match, unless caller wants all matches
            if verbosity != Verbosity::All {
                return suggestions;
            }
        }

        //early termination, if we only want to check if word in dictionary or get its frequency e.g. for word segmentation
        if max_term_edit_distance == 0 {
            return suggestions;
        }

        hashset2.insert(input.to_string());

        let mut max_edit_distance2 = max_term_edit_distance;
        let mut candidate_pointer = 0;
        let mut candidates = Vec::new();

        let mut input_prefix_len = input_len;

        if input_prefix_len > self.prefix_length {
            input_prefix_len = self.prefix_length;
            candidates.push(slice(input, 0, input_prefix_len));
        } else {
            candidates.push(input.to_string());
        }

        while candidate_pointer < candidates.len() {
            let candidate = &candidates.get(candidate_pointer).unwrap().clone();
            candidate_pointer += 1;
            let candidate_len = len(candidate);
            let length_diff = input_prefix_len as isize - candidate_len as isize;

            //save some time - early termination
            //if canddate distance is already higher than suggestion distance, than there are no better suggestions to be expected
            if length_diff > max_edit_distance2 as isize {
                // skip to next candidate if Verbosity.All, look no further if Verbosity.Top or Closest
                // (candidates are ordered by delete distance, so none are closer than current)
                if verbosity == Verbosity::All {
                    continue;
                }
                break;
            }

            //read candidate entry from dictionary
            let hash = hash32(candidate.as_bytes());
            if self.deletes.contains_key(&hash) {
                let dict_suggestions = &self.deletes[&hash];

                //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
                for suggestion in dict_suggestions {
                    let suggestion_len = len(suggestion);

                    if suggestion.as_ref() == input {
                        continue;
                    }

                    if suggestion_len.abs_diff(input_len) > max_edit_distance2
                        || suggestion_len < candidate_len
                        || (suggestion_len == candidate_len && suggestion.as_ref() != candidate)
                    {
                        continue;
                    }

                    let sugg_prefix_len = min(suggestion_len, self.prefix_length);

                    if sugg_prefix_len > input_prefix_len
                        && sugg_prefix_len - candidate_len > max_edit_distance2
                    {
                        continue;
                    }

                    //Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                    //We allow simultaneous edits (deletes) of maxEditDistance on on both the dictionary and the input term.
                    //For replaces and adjacent transposes the resulting edit distance stays <= maxEditDistance.
                    //For inserts and deletes the resulting edit distance might exceed maxEditDistance.
                    //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                    //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for maxEditDistance=1)
                    //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                    let distance;
                    if candidate_len == 0 {
                        //suggestions which have no common chars with input (inputLen<=maxEditDistance && suggestionLen<=maxEditDistance)
                        distance = cmp::max(input_len, suggestion_len);

                        if distance > max_edit_distance2 || hashset2.contains(suggestion.as_ref()) {
                            continue;
                        }
                        hashset2.insert(suggestion.to_string());
                    } else if suggestion_len == 1 {
                        distance = if !input.contains(&slice(suggestion, 0, 1)) {
                            input_len
                        } else {
                            input_len - 1
                        };

                        if distance > max_edit_distance2 || hashset2.contains(suggestion.as_ref()) {
                            continue;
                        }

                        hashset2.insert(suggestion.to_string());
                    // number of edits in prefix ==maxediddistance  AND no identic suffix,
                    // then editdistance>maxEditDistance and no need for Levenshtein calculation
                    // (inputLen >= prefixLength) && (suggestionLen >= prefixLength)
                    } else if self.has_different_suffix(
                        max_term_edit_distance,
                        input,
                        input_len,
                        candidate_len,
                        suggestion,
                        suggestion_len,
                    ) {
                        continue;
                    } else {
                        // DeleteInSuggestionPrefix is somewhat expensive, and only pays off when verbosity is Top or Closest.
                        if verbosity != Verbosity::All
                            && !self.delete_in_suggestion_prefix(
                                candidate,
                                candidate_len,
                                suggestion,
                                suggestion_len,
                            )
                        {
                            continue;
                        }

                        if hashset2.contains(suggestion.as_ref()) {
                            continue;
                        }
                        hashset2.insert(suggestion.to_string());

                        distance = if let Some(distance) =
                            damerau_levenshtein_osa(input, suggestion, max_edit_distance2)
                        {
                            distance
                        } else {
                            continue;
                        };
                    }
                    //save some time
                    //do not process higher distances than those already found, if verbosity<All (note: maxEditDistance2 will always equal maxEditDistance when Verbosity::All)
                    if distance <= max_edit_distance2 {
                        let suggestion_count = self.words[suggestion];
                        let si = Suggestion::new(suggestion.as_ref(), distance, suggestion_count);

                        if !suggestions.is_empty() {
                            match verbosity {
                                Verbosity::Closest => {
                                    //we will calculate DamLev distance only to the smallest found distance so far
                                    if distance < max_edit_distance2 {
                                        suggestions.clear();
                                    }
                                }
                                Verbosity::Top => {
                                    if distance < max_edit_distance2
                                        || suggestion_count > suggestions[0].count
                                    {
                                        max_edit_distance2 = distance;
                                        suggestions[0] = si;
                                    }
                                    continue;
                                }
                                _ => (),
                            }
                        }

                        if verbosity != Verbosity::All {
                            max_edit_distance2 = distance;
                        }

                        suggestions.push(si);
                    }
                }
            }

            //add edits
            //derive edits (deletes) from candidate (input) and add them to candidates list
            //this is a recursive process until the maximum edit distance has been reached
            if length_diff < max_term_edit_distance as isize && candidate_len <= self.prefix_length
            {
                //save some time
                //do not create edits with edit distance smaller than suggestions already found
                if verbosity != Verbosity::All && length_diff >= max_edit_distance2 as isize {
                    continue;
                }

                for i in 0..candidate_len {
                    let delete = remove(candidate, i);

                    if !hashset1.contains(&delete) {
                        hashset1.insert(delete.clone());
                        candidates.push(delete);
                    }
                }
            }
        }

        //sort by ascending edit distance, then by descending word frequency
        if suggestions.len() > 1 {
            suggestions.sort_unstable();
        }

        //transfer case from input to suggestion
        if preserve_case {
            for suggestion in suggestions.iter_mut() {
                suggestion.term = transfer_case(input_original_case, &suggestion.term);
            }
        }

        if let Some(max_results) = max_results {
            suggestions.truncate(max_results);
        }
        suggestions
    }

    /// Find suggested spellings for a multi-word input string (supports word splitting/merging).
    /// Returns a list of Suggestion representing suggested correct spellings for the input string.
    ///
    /// lookup_compound supports compound aware automatic spelling correction of multi-word input strings with three cases:
    /// 1. mistakenly inserted space into a correct word led to two incorrect terms
    /// 2. mistakenly omitted space between two correct words led to one incorrect combined term
    /// 3. multiple independent input terms with/without spelling errors
    ///
    /// # Arguments
    ///
    /// * `input` - The sentence being spell checked.
    /// * `max_edit_distance` - The maximum edit distance between input and suggested words.
    /// * `preserve_case` - Whether to preserve the letter case from input to suggestion.
    ///
    /// # Examples
    ///
    /// ```
    /// use symspell_rs::{SymSpell};
    /// use std::path::Path;
    ///
    /// let mut symspell: SymSpell = SymSpell::new(2, None,7, 1);
    /// symspell.load_dictionary(Path::new("data/frequency_dictionary_en_82_765.txt"), 0, 1, " ");
    /// symspell.lookup_compound("whereis th elove", 2, &None,false);
    /// ```
    pub fn lookup_compound(
        &self,
        input: &str,
        edit_distance_max: usize,
        term_length_threshold: &Option<Vec<usize>>,
        preserve_case: bool,
    ) -> Vec<Suggestion> {
        //parse input string into single terms
        let term_list1 = parse_words(input);

        let mut suggestions: Vec<Suggestion>; //suggestions for a single term
        let mut suggestion_parts: Vec<Suggestion> = Vec::new(); //1 line with separate parts

        //translate every term to its best suggestion, otherwise it remains unchanged
        let mut last_combi = false;
        for (i, term) in term_list1.iter().enumerate() {
            suggestions = self.lookup(
                term,
                Verbosity::Top,
                edit_distance_max,
                term_length_threshold,
                None,
                false,
            );

            //combi check, always before split
            if i > 0 && !last_combi {
                let mut suggestions_combi: Vec<Suggestion> = self.lookup(
                    &[term_list1[i - 1].as_str(), term_list1[i].as_str()].join(""),
                    Verbosity::Top,
                    edit_distance_max,
                    term_length_threshold,
                    None,
                    false,
                );

                if !suggestions_combi.is_empty() {
                    let best1 = suggestion_parts[suggestion_parts.len() - 1].clone();
                    let best2 = if !suggestions.is_empty() {
                        suggestions[0].clone()
                    } else {
                        Suggestion::new(
                            //unknown word
                            term_list1[i].as_str(),
                            //estimated edit distance
                            edit_distance_max + 1,
                            // estimated word occurrence probability P=10 / (N * 10^word length l)
                            // estimated word count C=10 / 10^word length l
                            // formulae to calculate the probability of an unknown word proposed by Peter Norvig in Natural Language Corpus Data, page 224 http://norvig.com/ngrams/ch14.pdf
                            // estimated count always 0 if termlength > 3 and independent from corpus_word_count???
                            (10f64 / 10usize.saturating_pow(len(&term_list1[i]) as u32) as f64)
                                as usize,
                        )
                    };

                    //distance1=edit distance between 2 split terms und their best corrections : as comparative value for the combination
                    let distance1 = best1.distance + best2.distance;
                    if suggestions_combi[0].distance + 1 < distance1
                        || (suggestions_combi[0].distance + 1 == distance1
                            && (suggestions_combi[0].count
                                    // best1 / corpus * best1 / corpus * corpus
                                    > (best1.count as f64 / self.corpus_word_count as f64 * best2.count as f64) as usize))
                    {
                        suggestions_combi[0].distance += 1;
                        let last_i = suggestion_parts.len() - 1;
                        suggestion_parts[last_i] = suggestions_combi[0].clone();
                        last_combi = true;
                        continue;
                    }
                }
            }
            last_combi = false;

            //alway split terms without suggestion / never split terms with suggestion ed=0 / never split single char terms
            if !suggestions.is_empty()
                && ((suggestions[0].distance == 0) || (len(&term_list1[i]) == 1))
            {
                //choose best suggestion
                suggestion_parts.push(suggestions[0].clone());
            } else {
                let mut suggestion_split_best = if !suggestions.is_empty() {
                    //add original term
                    suggestions[0].clone()
                } else {
                    //if no perfect suggestion, split word into pairs
                    Suggestion::empty()
                };

                let term_length = len(&term_list1[i]);
                if term_length > 1 {
                    for j in 1..term_length {
                        let part1 = slice(&term_list1[i], 0, j);
                        let part2 = slice(&term_list1[i], j, term_length);
                        let mut suggestion_split = Suggestion::empty();
                        let suggestions1 = self.lookup(
                            &part1,
                            Verbosity::Top,
                            edit_distance_max,
                            term_length_threshold,
                            None,
                            false,
                        );
                        if !suggestions1.is_empty() {
                            let suggestions2 = self.lookup(
                                &part2,
                                Verbosity::Top,
                                edit_distance_max,
                                term_length_threshold,
                                None,
                                false,
                            );

                            if !suggestions2.is_empty() {
                                //select best suggestion for split pair
                                suggestion_split.term =
                                    [suggestions1[0].term.as_str(), suggestions2[0].term.as_str()]
                                        .join(" ");

                                /*
                                let mut distance2 = damerau_levenshtein_osa(
                                    &term_list1[i],
                                    &suggestion_split.term,
                                    edit_distance_max as usize,
                                );

                                if distance2 < 0 {
                                    distance2 = edit_distance_max + 1;
                                }
                                */

                                let distance2 = if let Some(d) = damerau_levenshtein_osa(
                                    &term_list1[i],
                                    &suggestion_split.term,
                                    edit_distance_max,
                                ) {
                                    d
                                } else {
                                    edit_distance_max + 1
                                };

                                if !suggestion_split_best.term.is_empty() {
                                    if distance2 > suggestion_split_best.distance {
                                        continue;
                                    }
                                    if distance2 < suggestion_split_best.distance {
                                        suggestion_split_best = Suggestion::empty();
                                    }
                                }

                                let bigram_count: usize =
                                    match self.bigrams.get(&*suggestion_split.term) {
                                        //if bigram exists in bigram dictionary
                                        Some(&bigram_frequency) => {
                                            //increase count, if split.corrections are part of or identical to input
                                            //single term correction exists
                                            if !suggestions.is_empty() {
                                                let best_si = &suggestions[0];
                                                //alternatively remove the single term from suggestionsSplit, but then other splittings could win
                                                if suggestion_split.term == term_list1[i] {
                                                    //make count bigger than count of single term correction
                                                    cmp::max(bigram_frequency, best_si.count + 2)
                                                } else if suggestions1[0].term == best_si.term
                                                    || suggestions2[0].term == best_si.term
                                                {
                                                    //make count bigger than count of single term correction
                                                    cmp::max(bigram_frequency, best_si.count + 1)
                                                } else {
                                                    bigram_frequency
                                                }
                                            // no single term correction exists
                                            } else if suggestion_split.term == term_list1[i] {
                                                cmp::max(
                                                    bigram_frequency,
                                                    cmp::max(
                                                        suggestions1[0].count,
                                                        suggestions2[0].count,
                                                    ) + 2,
                                                )
                                            } else {
                                                bigram_frequency
                                            }
                                        }
                                        None => {
                                            //The Naive Bayes probability of the word combination is the product of the two word probabilities: P(AB) = P(A) * P(B)
                                            //use it to estimate the frequency count of the combination if no bigram in dictionary found, which then is used to rank/select the best splitting variant
                                            min(
                                                self.bigram_min_count,
                                                (suggestions1[0].count as f64
                                                    / self.corpus_word_count as f64
                                                    * suggestions2[0].count as f64)
                                                    as usize,
                                            )
                                        }
                                    };

                                suggestion_split.distance = distance2;
                                suggestion_split.count = bigram_count;

                                if suggestion_split_best.term.is_empty()
                                    || suggestion_split.count > suggestion_split_best.count
                                {
                                    suggestion_split_best = suggestion_split.clone();
                                }
                            }
                        }
                    }

                    if !suggestion_split_best.term.is_empty() {
                        //select best suggestion for split pair
                        suggestion_parts.push(suggestion_split_best.clone());
                    } else {
                        let mut si = Suggestion::empty();
                        si.term = term_list1[i].clone();
                        // estimated word occurrence probability P=10 / (N * 10^word length l)
                        // estimated word count C=10 / 10^word length l
                        // formulae to calculate the probability of an unknown word proposed by Peter Norvig in Natural Language Corpus Data, page 224 http://norvig.com/ngrams/ch14.pdf
                        // estimated count always 0 if termlength > 3 and independent from corpus_word_count???
                        si.count =
                            (10f64 / 10usize.saturating_pow(term_length as u32) as f64) as usize;
                        si.distance = edit_distance_max + 1;
                        suggestion_parts.push(si);
                    }
                } else {
                    let mut si = Suggestion::empty();
                    si.term = term_list1[i].clone();
                    // estimated word occurrence probability P=10 / (N * 10^word length l)
                    // estimated word count C=10 / 10^word length l
                    // formulae to calculate the probability of an unknown word proposed by Peter Norvig in Natural Language Corpus Data, page 224 http://norvig.com/ngrams/ch14.pdf
                    // estimated count always 0 if termlength > 3 and independent from corpus_word_count???
                    si.count = (10f64 / 10usize.saturating_pow(term_length as u32) as f64) as usize;
                    si.distance = edit_distance_max + 1;
                    suggestion_parts.push(si);
                }
            }
        }

        let mut suggestion = Suggestion::empty();

        let mut tmp_count: f64 = self.corpus_word_count as f64;

        let mut s = "".to_string();
        for si in suggestion_parts {
            s.push_str(&si.term);
            s.push(' ');
            tmp_count *= si.count as f64 / self.corpus_word_count as f64;
        }

        let output = s.trim();
        suggestion.count = tmp_count as usize;
        suggestion.distance =
            damerau_levenshtein_osa(&input.to_lowercase(), output, usize::MAX).unwrap();

        //transfer case from input to suggestion
        suggestion.term = if preserve_case {
            transfer_case(input, output)
        } else {
            output.to_lowercase()
        };

        vec![suggestion]
    }

    /// word_segmentation divides a string into words by inserting missing spaces at the appropriate positions.
    /// word_segmentation works on text with any letter case which is retained in the output segmentation.
    /// word_segmentation works on noisy text with spelling mistakes, which are corrected in the output segmentation.
    /// existing spaces are allowed and considered for optimum segmentation.
    ///
    /// word_segmentation uses a novel approach *without* recursion.
    /// https://seekstorm.com/blog/fast-word-segmentation-noisy-text/
    /// While each string of length n can be segmentend in 2^n−1 possible compositions https://en.wikipedia.org/wiki/Composition_(combinatorics)
    /// word_segmentation has a linear runtime O(n) to find the optimum composition
    ///
    /// # Arguments
    ///
    /// * `input` - The string being segmented into words. Upper/lower case allowed.
    /// * `max_edit_distance` - The maximum edit distance between input and suggested words.
    ///
    /// # Returns
    ///
    /// * the word segmented and spelling corrected string,
    /// * The edit distance sum between input string and corrected string,
    /// * The sum of word occurence probabilities in log scale (a measure of how common and probable the corrected segmentation is).
    ///
    /// # Examples
    ///
    /// ```
    /// use symspell_rs::{SymSpell, Verbosity};
    /// use std::path::Path;
    ///
    /// let mut symspell: SymSpell = SymSpell::new(2, None, 7, 1);
    /// symspell.load_dictionary(Path::new("data/frequency_dictionary_en_82_765.txt"), 0, 1, " ");
    /// symspell.word_segmentation("itwas", 2);
    /// ```
    pub fn word_segmentation(&self, input: &str, max_edit_distance: usize) -> Composition {
        // Normalize ligatures: "scientific" "fields" "final"
        let input = &unicode_normalization_form_kc(input);

        let asize = len(input);

        let mut ci: usize = 0;
        let mut compositions: Vec<Composition> = vec![Composition::empty(); asize];

        //outer loop (column): all possible part start positions
        for j in 0..asize {
            //inner loop (row): all possible part lengths (from start position): part can't be bigger than longest word in dictionary (other than long unknown word)
            let imax = min(asize - j, self.max_dictionary_term_length);
            for i in 1..=imax {
                //get top spelling correction/ed for part
                let mut part = slice(input, j, j + i);

                let mut sep_len = 0;
                let mut top_ed = 0;

                let first_char = at(&part, 0).unwrap();
                if first_char.is_whitespace() {
                    //remove space for levensthein calculation
                    part = remove(&part, 0);
                } else {
                    //add ed+1: space did not exist, had to be inserted
                    sep_len = 1;
                }

                //remove space from part1, add number of removed spaces to topEd
                top_ed += part.len();
                //remove space
                part = part.replace(" ", "");
                top_ed -= part.len();

                // Lookup against the lowercase term
                // word_segmentation works on text with any case which is retained in the output segmentation.
                // word_segmentation works on noisy text with spelling mistakes, which are corrected in the output segmentation.
                let results =
                    self.lookup(&part, Verbosity::Top, max_edit_distance, &None, None, true);
                let top_prob_log = if !results.is_empty() {
                    //retain/preserve/transfer letter case during correction
                    if results[0].distance > 0 {
                        part = results[0].term.clone();
                        top_ed += results[0].distance;
                    }

                    //Naive Bayes Rule
                    //we assume the word probabilities of two words to be independent
                    //therefore the resulting probability of the word combination is the product of the two word probabilities

                    //instead of computing the product of probabilities we are computing the sum of the logarithm of probabilities
                    //because the probabilities of words are about 10^-10, the product of many such small numbers could exceed (underflow) the floating number range and become zero
                    //log(ab)=log(a)+log(b)
                    //todo: use decimal crate for higher precision?
                    (results[0].count as f64 / self.corpus_word_count as f64).log10()
                } else {
                    let part_len = len(&part);

                    //default, if word not found
                    //otherwise long input text would win as long unknown word (with ed=edmax+1 ), although there there should many spaces inserted
                    top_ed += part_len;
                    (10.0 / (self.corpus_word_count as f64 * 10.0f64.powf(part_len as f64))).log10()
                };

                let di = (i + ci) % asize;
                // set values in first loop
                if j == 0 {
                    compositions[i - 1] = Composition {
                        segmented_string: part.to_owned(),
                        distance_sum: top_ed,
                        prob_log_sum: top_prob_log,
                    };
                } else if i == self.max_dictionary_term_length
                    //replace values if better probabilityLogSum, if same edit distance OR one space difference 
                    || (((compositions[ci].distance_sum + top_ed == compositions[di].distance_sum)
                        || (compositions[ci].distance_sum + sep_len + top_ed
                            == compositions[di].distance_sum))
                        && (compositions[di].prob_log_sum
                            < compositions[ci].prob_log_sum + top_prob_log))
                    //replace values if smaller edit distance 
                    || (compositions[ci].distance_sum + sep_len + top_ed
                        < compositions[di].distance_sum)
                {
                    //keep punctuation or apostrophe adjacent to previous word
                    if ((part.len() == 1) && part.chars().nth(0).unwrap().is_ascii_punctuation())
                        || ((part.len() == 3) && part.starts_with(""))
                    {
                        compositions[di] = Composition {
                            segmented_string: [
                                compositions[ci].segmented_string.as_str(),
                                part.as_str(),
                            ]
                            .join(""),
                            distance_sum: compositions[ci].distance_sum + top_ed,
                            prob_log_sum: compositions[ci].prob_log_sum + top_prob_log,
                        };
                    } else {
                        //todo: keep segmented_string and corrected string separate
                        compositions[di] = Composition {
                            segmented_string: [
                                compositions[ci].segmented_string.as_str(),
                                part.as_str(),
                            ]
                            .join(" "),
                            distance_sum: compositions[ci].distance_sum + sep_len + top_ed,
                            prob_log_sum: compositions[ci].prob_log_sum + top_prob_log,
                        };
                    }
                }
            }
            if j != 0 {
                ci += 1;
            }
            ci = if ci == asize { 0 } else { ci };
        }

        if compositions.is_empty() {
            Composition {
                segmented_string: input.to_string(),
                distance_sum: 0,
                prob_log_sum: 0.0,
            }
        } else {
            compositions[ci].to_owned()
        }
    }

    // Check whether all delete chars are present in the suggestion prefix in correct order, otherwise this is just a hash collision
    fn delete_in_suggestion_prefix(
        &self,
        delete: &str,
        delete_len: usize,
        suggestion: &str,
        suggestion_len: usize,
    ) -> bool {
        if delete_len == 0 {
            return true;
        }
        let suggestion_len = if self.prefix_length < suggestion_len {
            self.prefix_length
        } else {
            suggestion_len
        };
        let mut j = 0;
        for i in 0..delete_len {
            let del_char = at(delete, i as isize).unwrap();
            while j < suggestion_len && del_char != at(suggestion, j as isize).unwrap() {
                j += 1;
            }

            if j == suggestion_len {
                return false;
            }
        }
        true
    }

    /// Create/Update an entry in the dictionary
    /// For every word there are deletes with an edit distance of 1..maxEditDistance created and added to the
    /// dictionary. Every delete entry has a suggestions list, which points to the original term(s) it was created from.
    /// The dictionary may be dynamically updated (word frequency and new words) at any time by calling CreateDictionaryEntry
    /// # Arguments
    ///
    /// * `term` - The word to add to dictionary.
    /// * `count` - The frequency count for word.
    ///
    /// Returns True if the word is added or updated and the sum count reaches or exceeds the threshold for the first time,
    /// making it a new entry valid for suggestions, otherwise False.
    pub fn create_dictionary_entry<T>(&mut self, term: T, count: usize) -> bool
    where
        T: Clone + AsRef<str> + Into<String>,
    {
        // update words
        let entry = self
            .words
            .entry(term.clone().into().into_boxed_str())
            .or_insert(0);
        if *entry == 0 {
            *entry = count;
            if count < self.count_threshold {
                return false;
            }
        } else {
            let old_count = *entry;
            let updated_count = if usize::MAX - *entry > count {
                *entry + count
            } else {
                usize::MAX
            };
            *entry = updated_count;

            if updated_count < self.count_threshold || old_count >= self.count_threshold {
                return false;
            }
        }

        // create deletes
        let term_len = len(term.as_ref());

        //do not create deletes for short words below threshold
        //todo: move to start of function: latency of len vs. words entry + count?
        if self
            .term_length_threshold
            .as_ref()
            .is_some_and(|term_length_threshold| {
                !term_length_threshold.is_empty() && term_len < term_length_threshold[0]
            })
        {
            return false;
        }

        let max_term_edit_distance = self.term_length_threshold.as_ref().map_or(
            self.max_dictionary_edit_distance,
            |term_length_threshold| {
                //max_term_edit_distance dependent on term_length_threshold
                let mut max_term_edit_distance = 0;
                for (i, threshold) in term_length_threshold.iter().enumerate() {
                    if term_len >= *threshold {
                        max_term_edit_distance = self.max_dictionary_edit_distance + i
                    } else {
                        break;
                    }
                }
                max_term_edit_distance
            },
        );

        //collect max_dictionary_term_length
        if term_len > self.max_dictionary_term_length {
            self.max_dictionary_term_length = term_len;
        }

        let edits = self.edits_prefix(term.as_ref(), max_term_edit_distance);

        for delete in edits {
            let delete_hash = hash32(delete.as_bytes());

            self.deletes
                .entry(delete_hash)
                .and_modify(|e| e.push(term.clone().into().into_boxed_str()))
                .or_insert_with(|| vec![term.clone().into().into_boxed_str()]);
        }

        true
    }

    fn edits_prefix(&self, key: &str, max_term_edit_distance: usize) -> AHashSet<String> {
        let mut hash_set = AHashSet::new();

        let key_len = len(key);

        if key_len <= self.max_dictionary_edit_distance {
            hash_set.insert("".to_string());
        }

        if key_len > self.prefix_length {
            let shortened_key = slice(key, 0, self.prefix_length);
            hash_set.insert(shortened_key.clone());
            edits(&shortened_key, 0, max_term_edit_distance, &mut hash_set);
        } else {
            hash_set.insert(key.to_string());
            edits(key, 0, max_term_edit_distance, &mut hash_set);
        };

        hash_set
    }

    fn has_different_suffix(
        &self,
        max_edit_distance: usize,
        input: &str,
        input_len: usize,
        candidate_len: usize,
        suggestion: &str,
        suggestion_len: usize,
    ) -> bool {
        // handles the shortcircuit of min_distance
        // assignment when first boolean expression
        // evaluates to false
        let min =
            if self.prefix_length as isize - max_edit_distance as isize == candidate_len as isize {
                min(input_len, suggestion_len) as isize - self.prefix_length as isize
            } else {
                0
            };

        (self.prefix_length - max_edit_distance == candidate_len)
            && (((min - self.prefix_length as isize) > 1)
                && (suffix(input, input_len + 1 - min as usize)
                    != suffix(suggestion, suggestion_len + 1 - min as usize)))
            || ((min > 0)
                && (at(input, (input_len - min as usize) as isize)
                    != at(suggestion, (suggestion_len - min as usize) as isize))
                && ((at(input, (input_len - min as usize - 1) as isize)
                    != at(suggestion, (suggestion_len - min as usize) as isize))
                    || (at(input, (input_len - min as usize) as isize)
                        != at(suggestion, (suggestion_len - min as usize - 1) as isize))))
    }
}