1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
pub(crate) mod parser;
use crate::{
alloc::{
borrow::Cow,
boxed::Box,
string::{String, ToString},
vec::Vec,
},
AffixingMode, Flag, FlagSet, AT_COMPOUND_BEGIN, AT_COMPOUND_END, FULL_WORD,
};
use core::{marker::PhantomData, num::NonZeroU16, str::Chars};
/// A flag reserved internally, presumably to mark hidden homonyms in the word list —
/// confirm against usage elsewhere in the crate.
///
/// `u16::MAX` lies outside the `1..=65000` range used by numeric flags (see `FlagType::Numeric`),
/// so it cannot collide with a dictionary-declared flag.
// SAFETY: `u16::MAX` is non-zero, so it is a valid argument for `Flag::new_unchecked`
// (assuming `Flag` is backed by `NonZeroU16` — see the `NonZeroU16` import above; TODO confirm).
pub(crate) const HIDDEN_HOMONYM_FLAG: Flag = unsafe { Flag::new_unchecked(u16::MAX) };
/// Cap on the number of suggestions produced — presumably consumed by the suggestion
/// machinery elsewhere in the crate.
pub(crate) const MAX_SUGGESTIONS: usize = 16;
/// Checks whether a flag collection contains an optional flag.
///
/// `$flags` is any value with a `contains(&Flag)` method (a `FlagSet` in practice);
/// `$flag` is an `Option<Flag>`. Evaluates to `false` when the flag is `None`, which keeps
/// call sites free of `map_or` boilerplate for the many optional `.aff` option flags.
macro_rules! has_flag {
    ( $flags:expr, $flag:expr ) => {{
        match $flag {
            Some(flag) => $flags.contains(&flag),
            None => false,
        }
    }};
}
/// The representation of a flag in a `.dic` or `.aff` file.
///
/// This representation also decides how we encode flags into `Flag`. This is controlled by the
/// `FLAG` directive in a `.aff` file. (Parsing presumably happens in the `parser` submodule —
/// confirm there.)
#[derive(Debug, Default, Clone, Copy)]
pub(crate) enum FlagType {
    /// A single ascii character.
    ///
    /// This is the default representation if a `.aff` file does not specify another.
    #[default]
    Short,
    /// Two adjacent ascii characters.
    ///
    /// The french dictionary uses this. For example for some proper nouns like `Asimov/L'D'Q'`:
    /// `L'` is a flag, `D'` another, `Q'` another.
    Long,
    /// A number in the range `1..=65000`.
    ///
    /// We will approximate this to `2^16`. Numeric flags are separated by commas.
    /// For example `actionfilm/70,7,252,976` from the Danish dictionary.
    Numeric,
    /// One UTF8 character described by at most two bytes.
    Utf8,
}
/// A tiny regex-like pattern used by `PFX`/`SFX` rules: literal characters, `.` wildcards
/// and (possibly negated) character classes such as `[^aeiou]`.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct Condition {
    /// The raw, uncompiled pattern text. Matching walks this string directly rather than
    /// compiling it to another form.
    ///
    /// This string is non-empty.
    pattern: Box<str>,
    /// How many `char`s of input the pattern consumes.
    ///
    /// The pattern language is small enough that a single linear scan at parse time can
    /// determine this count exactly.
    chars: usize,
}
impl Condition {
    /// Checks whether the pattern matches `input`, starting at its first character.
    ///
    /// Input left over after the pattern is exhausted is ignored; input that runs out
    /// before the pattern does is a failed match.
    pub fn matches(&self, input: &str) -> bool {
        let mut word = input.chars();
        let mut pat = self.pattern.chars().peekable();
        loop {
            // A fully consumed pattern is a match, no matter how much input remains.
            let pattern_ch = match pat.next() {
                Some(ch) => ch,
                None => return true,
            };
            // The pattern still has characters but the input ran out: no match.
            let word_ch = match word.next() {
                Some(ch) => ch,
                None => return false,
            };
            match pattern_ch {
                // `.` matches any single character.
                '.' => continue,
                // A character class, e.g. `[aeiou]`, or negated: `[^aeiou]`.
                '[' => {
                    let negated = pat.next_if_eq(&'^').is_some();
                    let mut member = false;
                    for class_ch in pat.by_ref() {
                        if class_ch == ']' {
                            break;
                        }
                        if class_ch == word_ch {
                            member = true;
                        }
                    }
                    // A positive class fails when the character is absent from it;
                    // a negated class fails when the character is present.
                    if member == negated {
                        return false;
                    }
                }
                // Anything else in the pattern is a literal and must match exactly.
                literal => {
                    if literal != word_ch {
                        return false;
                    }
                }
            }
        }
    }
}
/// Internal container type for a prefix or suffix.
#[derive(Debug, PartialEq, Eq, Clone)]
pub(crate) struct Affix<K> {
    /// The flag that words may use to reference this affix.
    pub flag: Flag,
    /// Whether the affix is compatible with the opposite affix. For example a word that has both
    /// a prefix and a suffix, both the prefix and suffix should have `crossproduct: true`.
    pub crossproduct: bool,
    /// What is stripped from the stem when the affix is applied.
    pub strip: Option<Box<str>>,
    /// What should be added when the affix is applied.
    pub add: Box<str>,
    /// Condition that the stem should be checked against to query if the affix is relevant.
    ///
    /// This is optional in Spellbook. Hunspell and Nuspell represent what we say is `None` as
    /// `"."`. It's a pattern that always matches the input since the input to `condition_matches`
    /// is never empty.
    condition: Option<Condition>,
    /// Continuation flags.
    ///
    /// These are included with the `add` in `.aff` files (separated by `/`).
    // TODO: document how they're used.
    pub flags: FlagSet,
    /// Zero-sized marker tying this affix to its kind (`Pfx` or `Sfx`) so that `appending`
    /// reads `add` in the direction appropriate for that kind.
    phantom_data: PhantomData<K>,
}
impl<K: AffixKind> Affix<K> {
    /// Builds an affix rule from parsed `.aff` fields.
    ///
    /// When `condition` is present it is parsed into a [`Condition`]; a malformed pattern
    /// is reported as a `parser::ConditionError`.
    pub fn new(
        flag: Flag,
        crossproduct: bool,
        strip: Option<&str>,
        add: &str,
        condition: Option<&str>,
        flags: FlagSet,
    ) -> Result<Self, parser::ConditionError> {
        let condition = match condition {
            Some(pattern) => Some(pattern.parse()?),
            None => None,
        };
        Ok(Self {
            flag,
            crossproduct,
            strip: strip.map(Into::into),
            add: add.into(),
            flags,
            condition,
            phantom_data: PhantomData,
        })
    }
    /// The characters of `add`, in the reading order of `K` (forwards for prefixes,
    /// backwards for suffixes).
    pub fn appending(&self) -> K::Chars<'_> {
        K::chars(&self.add)
    }
    /// Whether applying this affix changes the word at all.
    #[inline]
    pub fn is_modifying(&self) -> bool {
        !self.add.is_empty() || self.strip.is_some()
    }
}
/// Marker type for prefixes: affix text and words are read forwards, from the start.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) struct Pfx;
/// Marker type for suffixes: affix text and words are read backwards, from the end.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) struct Sfx;
/// Rules for replacing characters at the beginning of a stem.
pub(crate) type Prefix = Affix<Pfx>;
/// Rules for replacing characters at the end of a stem.
pub(crate) type Suffix = Affix<Sfx>;
/// A helper trait that, together with `Pfx` and `Sfx`, allows generically reading either
/// characters of a `&str` forwards or backwards.
///
/// This is a textbook ["lending iterator"] which uses a generic associated type to express that
/// the lifetime of the iterator is bound only to the input word.
///
/// ["lending iterator"]: https://rust-lang.github.io/generic-associated-types-initiative/design_patterns/iterable.html
pub(crate) trait AffixKind {
    /// The character iterator for this kind's reading direction, borrowing from the input.
    type Chars<'a>: Iterator<Item = char>
    where
        Self: 'a;
    /// Returns the characters of `word` in this kind's reading order.
    fn chars(word: &str) -> Self::Chars<'_>;
    // Reversed form of `affix_NOT_valid` from Nuspell.
    /// Whether an affix of this kind may be applied under the affixing `MODE`.
    fn is_valid<const MODE: AffixingMode>(affix: &Affix<Self>, options: &AffOptions) -> bool
    where
        Self: Sized;
}
impl AffixKind for Pfx {
    type Chars<'a> = Chars<'a>;
    /// Prefixes read the word forwards, from the first character.
    fn chars(word: &str) -> Self::Chars<'_> {
        word.chars()
    }
    /// Reversed form of `affix_NOT_valid` from Nuspell: `true` when the prefix may be
    /// applied under the given affixing `MODE`.
    fn is_valid<const MODE: AffixingMode>(prefix: &Prefix, options: &AffOptions) -> bool {
        // A prefix marked "only in compound" can't be used when checking a full,
        // non-compound word.
        if MODE == FULL_WORD && has_flag!(prefix.flags, options.only_in_compound_flag) {
            return false;
        }
        // At the end of a compound the prefix needs the "compound permit" flag to be allowed.
        if MODE == AT_COMPOUND_END && !has_flag!(prefix.flags, options.compound_permit_flag) {
            return false;
        }
        // In any compounding mode, the "compound forbid" flag rules the prefix out.
        if MODE != FULL_WORD && has_flag!(prefix.flags, options.compound_forbid_flag) {
            return false;
        }
        true
    }
}
impl AffixKind for Sfx {
    type Chars<'a> = core::iter::Rev<Chars<'a>>;
    /// Suffixes read the word backwards, from the last character.
    fn chars(word: &str) -> Self::Chars<'_> {
        word.chars().rev()
    }
    /// Reversed form of `affix_NOT_valid` from Nuspell: `true` when the suffix may be
    /// applied under the given affixing `MODE`.
    fn is_valid<const MODE: AffixingMode>(suffix: &Suffix, options: &AffOptions) -> bool {
        // A suffix marked "only in compound" can't be used when checking a full,
        // non-compound word.
        if MODE == FULL_WORD && has_flag!(suffix.flags, options.only_in_compound_flag) {
            return false;
        }
        // At the beginning of a compound the suffix needs the "compound permit" flag.
        // (Mirror image of the prefix case, which checks AT_COMPOUND_END.)
        if MODE == AT_COMPOUND_BEGIN && !has_flag!(suffix.flags, options.compound_permit_flag) {
            return false;
        }
        // In any compounding mode, the "compound forbid" flag rules the suffix out.
        if MODE != FULL_WORD && has_flag!(suffix.flags, options.compound_forbid_flag) {
            return false;
        }
        true
    }
}
impl Prefix {
    /// Strips this prefix from `word`, producing the stem.
    ///
    /// The prefix's `add` is removed from the beginning and replaced with the `strip`.
    ///
    /// Nuspell calls this `to_root`.
    ///
    /// # Panics
    ///
    /// Panics if this `Prefix`'s `add` is not a prefix of the input `word`.
    pub fn to_stem<'a>(&self, word: &'a str) -> Cow<'a, str> {
        let remainder = word
            .strip_prefix(&*self.add)
            .expect("to_stem should only be called when the `add` is a prefix of the word");
        match self.strip.as_deref() {
            None => Cow::Borrowed(remainder),
            Some(strip) => {
                let mut stem = String::from(strip);
                stem.push_str(remainder);
                Cow::Owned(stem)
            }
        }
    }
    /// Applies this prefix to a stem, producing the derived word.
    ///
    /// The prefix's `strip` is removed from the beginning and replaced with the `add`. This is
    /// the inverse of `Prefix::to_stem`.
    ///
    /// Nuspell calls this `to_derived.`
    ///
    /// # Panics
    ///
    /// Panics if this `Prefix` has a `strip` and the given `word` does not start with it.
    pub fn to_derived(&self, word: &str) -> String {
        let remainder = match self.strip.as_deref() {
            Some(strip) => word
                .strip_prefix(strip)
                .expect("to_derived should only be called when `strip` is a prefix of the word"),
            None => word,
        };
        let mut derived = String::from(&*self.add);
        derived.push_str(remainder);
        derived
    }
    /// Checks `word` against this prefix's condition pattern, if any.
    ///
    /// Prefixes without a condition (Hunspell's `.` pattern) match every word.
    pub fn condition_matches(&self, word: &str) -> bool {
        match self.condition.as_ref() {
            None => true,
            // Length in bytes is >= length in chars, so a word with fewer bytes than the
            // pattern has chars can never match; `Condition::matches` handles the rest.
            Some(condition) => word.len() >= condition.chars && condition.matches(word),
        }
    }
}
impl Suffix {
    /// Converts a word which ends with this `Suffix` to the word's stem.
    ///
    /// This suffix's `add` is removed from the end and replaced with the `strip`.
    ///
    /// Nuspell calls this `to_root`.
    ///
    /// # Panics
    ///
    /// This function `expect`s that the `Suffix`'s `add` is a suffix of the input `word`.
    pub fn to_stem<'a>(&self, word: &'a str) -> Cow<'a, str> {
        let stripped = word
            .strip_suffix(&*self.add)
            .expect("to_stem should only be called when the `add` is a suffix of the word");
        match self.strip.as_deref() {
            Some(strip) => {
                let mut stem = stripped.to_string();
                stem.push_str(strip);
                Cow::Owned(stem)
            }
            None => Cow::Borrowed(stripped),
        }
    }
    /// Converts a stem into a word ending with this `Suffix`.
    ///
    /// This suffix's `strip` is removed from the end and replaced with the `add`. This is
    /// the inverse of `Suffix::to_stem`.
    ///
    /// Nuspell calls this `to_derived.`
    ///
    /// # Panics
    ///
    /// This function `expect`s that the given `word` ends with this `Suffix`'s `strip`, if this
    /// suffix has a `strip`.
    pub fn to_derived(&self, word: &str) -> String {
        let mut stem = match &self.strip {
            Some(strip) => word
                .strip_suffix(&**strip)
                .expect("to_derived should only be called when `strip` is a suffix of the word"),
            None => word,
        }
        .to_string();
        stem.push_str(&self.add);
        stem
    }
    /// Checks the end of `word` against this suffix's condition pattern, if any.
    ///
    /// Suffixes without a condition (Hunspell's `.` pattern) match every word.
    pub fn condition_matches(&self, word: &str) -> bool {
        let condition = match self.condition.as_ref() {
            Some(condition) => condition,
            None => return true,
        };
        // Length in bytes is greater than or equal to length in chars, so a word with
        // fewer bytes than the pattern has chars can never match.
        let len_bytes = word.len();
        if len_bytes < condition.chars {
            return false;
        }
        // Walk backwards over (at most) the last `condition.chars` characters, counting
        // how many characters exist and how many bytes they span.
        let (chars, bytes) = word
            .char_indices()
            .rev()
            .take(condition.chars)
            .fold((0, 0), |(chars, _bytes), (byte_index, _ch)| {
                (chars + 1, len_bytes - byte_index)
            });
        // The word has fewer characters than the pattern requires.
        if chars < condition.chars {
            return false;
        }
        // Match the pattern against only the trailing `condition.chars` characters.
        condition.matches(&word[word.len() - bytes..])
    }
}
pub(crate) type PrefixIndex = AffixIndex<Pfx>;
pub(crate) type SuffixIndex = AffixIndex<Sfx>;
/// A data structure for looking up any affixes which might match a given word.
///
/// The `AffixIndex` is one of two central data structures, along with the `WordList`. It
/// functions very similarly to a [radix tree], allowing efficient lookup of prefix or suffix
/// rules.
///
/// For example a prefix from `en_US.aff` for "re":
///
/// ```text
/// PFX A Y 1
/// PFX A 0 re .
/// ```
///
/// That prefix strips nothing (`0`) from the beginning and adds "re" to the beginning of any
/// words it is applied to.
///
/// For prefixes, `affixes_of` returns an iterator over all of the `Prefix`es in the table which
/// have an `add` field which is a prefix of the search word.
///
/// This structure also searches from the end of the word when looking up suffixes. A suffix from
/// `en_US.aff`:
///
/// ```text
/// SFX D Y 4
/// SFX D 0 d e
/// SFX D y ied [^aeiou]y
/// SFX D 0 ed [^ey]
/// SFX D 0 ed [aeiou]y
/// ```
///
/// Any word in the word list with the "D" flag can try to apply these suffixes. For a word like
/// "aced," `affixes_of` would return the first, third and fourth suffixes, as `d`, `ed` and `ed`
/// are suffixes of "aced," but not the second (`ied`).
///
/// Internally this type is implemented using a sorted `Vec` of affixes - one table for prefixes
/// and one for suffixes. Iterating with `affixes_of` first emits all affixes with empty `add`
/// text. Then we look at the first character in the search string. We can constrain our search
/// to only the elements in the table that start with that character using a precomputed index
/// of characters to indices within the table. After considering the first character, we use
/// linear searches of the remaining table slice to constrain the search for each next character
/// in the search key.
///
/// [radix tree]: https://en.wikipedia.org/wiki/Radix_tree
// TODO: I originally tried a hashing-based approach using `hashbrown::raw::RawTable`. Lift that
// structure from the commit history and benchmark it against this Vec based approach.
#[derive(Debug, Clone)]
pub(crate) struct AffixIndex<C> {
    /// All affixes, sorted by their `add` ("appending") text in `C`'s reading order.
    table: Box<[Affix<C>]>,
    /// The distinct first characters of the non-empty `add` texts, in sorted order.
    first_char: Box<[char]>,
    /// `prefix_idx_with_first_char[i]` is the index into `table` of the first affix whose
    /// `add` begins with `first_char[i]`. One extra trailing element holds `table.len()` so
    /// `[i]` and `[i + 1]` always bound a character's slice.
    prefix_idx_with_first_char: Box<[usize]>,
    /// The union of all continuation flags used by any affix in the table.
    pub all_flags: FlagSet,
}
impl<C: AffixKind> FromIterator<Affix<C>> for AffixIndex<C> {
    /// Collects affixes into a sorted, indexed table.
    fn from_iter<T: IntoIterator<Item = Affix<C>>>(iter: T) -> Self {
        // Delegate to the `From<Vec<_>>` impl, which sorts and builds the first-char index.
        Vec::from_iter(iter).into()
    }
}
impl<C: AffixKind> From<Vec<Affix<C>>> for AffixIndex<C> {
    fn from(mut table: Vec<Affix<C>>) -> Self {
        // Sort the table lexicographically by key (the `add` text, read in `C`'s direction).
        // We will use this lexicographical ordering to efficiently search in AffixesIter.
        table.sort_unstable_by(|a, b| a.appending().cmp(b.appending()));
        let mut first_char = Vec::new();
        let mut prefix_idx_with_first_char = Vec::new();
        // Seek through the sorted table to the first element where the key is non-empty.
        // (Affixes with empty `add` sort to the front.)
        let mut first_char_idx = table.partition_point(|affix| affix.appending().next().is_none());
        while first_char_idx < table.len() {
            let ch = table[first_char_idx]
                .appending()
                .next()
                .expect("vec is sorted so empty keys are before the partition point");
            // Save the first character of the key and the index of the affix in the table that
            // starts off this character. We can use this while reading the AffixIndex to jump
            // ahead efficiently in the table.
            first_char.push(ch);
            prefix_idx_with_first_char.push(first_char_idx);
            // Advance to the first affix whose key starts with a different (greater)
            // character; if there is none, every remaining affix shares `ch` and we're done.
            match table[first_char_idx..].iter().position(|affix| {
                affix
                    .appending()
                    .next()
                    .expect("vec is sorted so empty keys are before the partition point")
                    > ch
            }) {
                Some(next_char_index) => first_char_idx += next_char_index,
                None => break,
            }
        }
        // Add an extra element to the end so that `prefix_idx_with_first_char` is always one
        // element longer than `first_char`.
        prefix_idx_with_first_char.push(table.len());
        // Union of all continuation flags across the table.
        let flags = table
            .iter()
            .flat_map(|affix| affix.flags.iter().copied())
            .collect::<Vec<Flag>>()
            .into();
        Self {
            table: table.into(),
            first_char: first_char.into(),
            prefix_idx_with_first_char: prefix_idx_with_first_char.into(),
            all_flags: flags,
        }
    }
}
impl<C: AffixKind> AffixIndex<C> {
    /// Returns an iterator over every affix in the table whose `add` text matches the
    /// start (for prefixes) or end (for suffixes) of `word`.
    pub fn affixes_of<'index, 'word>(
        &'index self,
        word: &'word str,
    ) -> AffixesIter<'index, 'word, C> {
        AffixesIter {
            chars: C::chars(word),
            chars_matched: 0,
            table: &self.table,
            first_char: &self.first_char,
            prefix_idx_with_first_char: &self.prefix_idx_with_first_char,
        }
    }
    /// The total number of affixes in the table.
    pub fn len(&self) -> usize {
        self.table.len()
    }
    /// Iterates over every affix, in sorted order.
    pub fn iter(&self) -> core::slice::Iter<Affix<C>> {
        self.table.iter()
    }
}
/// An iterator over the prefixes/suffixes of a given word.
pub(crate) struct AffixesIter<'index, 'word, C: AffixKind + 'static> {
    /// The remaining slice of candidate affixes; narrowed as more characters match.
    table: &'index [Affix<C>],
    /// Sorted distinct first characters of non-empty `add` texts (see `AffixIndex`).
    first_char: &'index [char],
    /// Table indices bounding each first character's slice (see `AffixIndex`).
    prefix_idx_with_first_char: &'index [usize],
    /// Characters of the search word, read in `C`'s direction (forwards for prefixes,
    /// backwards for suffixes).
    chars: C::Chars<'word>,
    /// How many characters of the search word have been matched so far.
    chars_matched: usize,
}
impl<'index, C: AffixKind> Iterator for AffixesIter<'index, '_, C> {
    type Item = &'index Affix<C>;
    /// Emits the next affix whose `add` text matches the word so far.
    ///
    /// Affixes with empty `add` come first, then the search narrows `self.table` one word
    /// character at a time using the sorted order of the table.
    fn next(&mut self) -> Option<Self::Item> {
        // TODO: revisit this type. I suspect it's faster to just use `str::starts_with` and
        // `str::ends_with` rather than char iterators. Also using char iterators shouldn't
        // be necessary I think - we could switch to bytes instead.
        // Return all affixes that append nothing first.
        if self.chars_matched == 0 {
            if self.table.is_empty() {
                return None;
            }
            let item = &self.table[0];
            if item.appending().next().is_some() {
                // The empty portion of the table is done.
                // Scan ahead to where the first character is.
                let ch = self.chars.next()?;
                let first_char_idx = self.first_char.iter().position(|c| *c == ch)?;
                // NOTE: `prefix_idx_with_first_char` always has at least one element and is
                // always one element longer than `first_char`, so we can safely index at `0`
                // and at whatever index we get from `first_char` plus one.
                let empty_offset = self.prefix_idx_with_first_char[0];
                // Constrain the bounds of the search to affixes that share the first letter
                // of the key. Offset by the number of affixes with empty `add` that we emitted
                // previously (the table slice no longer contains them).
                let start = self.prefix_idx_with_first_char[first_char_idx] - empty_offset;
                let end = self.prefix_idx_with_first_char[first_char_idx + 1] - empty_offset;
                self.table = &self.table[start..end];
                self.chars_matched = 1;
            } else {
                // Still in the empty-`add` portion: emit this affix and shrink the slice.
                self.table = &self.table[1..];
                return Some(item);
            }
        }
        loop {
            if self.table.is_empty() {
                return None;
            }
            // If the search key is exactly matched so far (up to the number of characters we've
            // seen), emit the item.
            let item = &self.table[0];
            if item.appending().count() == self.chars_matched {
                self.table = &self.table[1..];
                return Some(item);
            }
            // Look at the next character in the search key. Limit the search to the slice of
            // the table where the nth character for each affix matches this character of the
            // search key.
            let ch = self.chars.next()?;
            // Move `start` up to the index of the first affix that has this character in its
            // nth position. `None` here means no remaining affix can match: stop iterating.
            let char_beginning_idx = self
                .table
                .iter()
                .position(|affix| affix.appending().nth(self.chars_matched) == Some(ch))?;
            self.table = &self.table[char_beginning_idx..];
            // Move the `end` back so that the last element in the search slice is the last
            // affix that shares this character in its nth position. (The table is sorted, so
            // affixes sharing the character form a contiguous run.)
            let char_end_idx = self
                .table
                .partition_point(|affix| affix.appending().nth(self.chars_matched) == Some(ch));
            self.table = &self.table[..char_end_idx];
            self.chars_matched += 1;
        }
    }
}
/// A collection of patterns used to break words into smaller words.
///
/// This is internally represented with a single `table` which is partitioned into three sections:
/// one for patterns that apply at the beginning of words, one for patterns that can apply
/// anywhere in the middle of a word, and one for patterns that must apply to the end of a word.
///
// TODO: document how breaks are used and what the patterns mean.
#[derive(Debug, Clone)]
pub(crate) struct BreakTable {
    /// All break patterns, with anchors stripped, partitioned as "start, middle, end".
    /// (Nuspell partitions in "start, end, middle" order instead.)
    table: Box<[Box<str>]>,
    /// One past the last start-anchored pattern in `table`.
    start_word_breaks_last_idx: usize,
    /// One past the last middle (unanchored) pattern in `table`.
    middle_word_breaks_last_idx: usize,
}
impl BreakTable {
    /// Partitions `breaks` by their `^`/`$` anchors, stripping the anchor character.
    fn new(breaks: &[&str]) -> Self {
        let mut start: Vec<Box<str>> = Vec::new();
        let mut middle: Vec<Box<str>> = Vec::new();
        let mut end: Vec<Box<str>> = Vec::new();
        for &pattern in breaks {
            // Break patterns are parsed in a way that ensures they are non-empty.
            assert!(!pattern.is_empty());
            if let Some(rest) = pattern.strip_prefix('^') {
                start.push(rest.into());
            } else if let Some(rest) = pattern.strip_suffix('$') {
                end.push(rest.into());
            } else {
                middle.push(pattern.into());
            }
        }
        let start_word_breaks_last_idx = start.len();
        let middle_word_breaks_last_idx = start_word_breaks_last_idx + middle.len();
        let mut table = start;
        table.extend(middle);
        table.extend(end);
        Self {
            table: table.into_boxed_slice(),
            start_word_breaks_last_idx,
            middle_word_breaks_last_idx,
        }
    }
    /// Patterns that only apply at the beginning of a word (`^`-anchored in the `.aff` file).
    #[inline]
    pub fn start_word_breaks(&self) -> impl Iterator<Item = &str> {
        self.table[..self.start_word_breaks_last_idx]
            .iter()
            .map(|pattern| &**pattern)
    }
    /// Patterns that can apply anywhere in the middle of a word (unanchored).
    #[inline]
    pub fn middle_word_breaks(&self) -> impl Iterator<Item = &str> {
        self.table[self.start_word_breaks_last_idx..self.middle_word_breaks_last_idx]
            .iter()
            .map(|pattern| &**pattern)
    }
    /// Patterns that only apply at the end of a word (`$`-anchored in the `.aff` file).
    #[inline]
    pub fn end_word_breaks(&self) -> impl Iterator<Item = &str> {
        self.table[self.middle_word_breaks_last_idx..]
            .iter()
            .map(|pattern| &**pattern)
    }
}
/// An ordered sequence of replacement pairs.
///
/// This is very similar to break patterns both in terms of the language used to describe them
/// in aff files and in representation. It's an ordered sequence with the replacements that only
/// apply:
///
/// 1. To entire words.
/// 2. At the beginning of words.
/// 3. At the end of words.
/// 4. Anywhere in the word.
///
/// Otherwise though it's basically a `Vec<(String, String)>`.
#[derive(Debug, Clone)]
pub(crate) struct ReplacementTable {
    /// All `(from, to)` pairs with anchors stripped, partitioned as
    /// "whole word, start, end, anywhere".
    table: Box<[(Box<str>, Box<str>)]>,
    /// One past the last whole-word pair in `table`.
    whole_word_replacements_last_idx: usize,
    /// One past the last start-anchored pair in `table`.
    start_word_replacements_last_idx: usize,
    /// One past the last end-anchored pair in `table`.
    end_word_replacements_last_idx: usize,
}
impl From<Vec<(&str, String)>> for ReplacementTable {
    /// Partitions replacement pairs by the `^`/`$` anchors on the `from` pattern,
    /// stripping the anchor characters.
    fn from(replacements: Vec<(&str, String)>) -> Self {
        let mut whole: Vec<(Box<str>, Box<str>)> = Vec::new();
        let mut start: Vec<(Box<str>, Box<str>)> = Vec::new();
        let mut end: Vec<(Box<str>, Box<str>)> = Vec::new();
        let mut anywhere: Vec<(Box<str>, Box<str>)> = Vec::new();
        for (from, to) in replacements {
            // Replacements are parsed in a way that ensures they are non-empty.
            assert!(!from.is_empty() && !to.is_empty());
            match from.strip_prefix('^') {
                Some(rest) => match rest.strip_suffix('$') {
                    // Anchored on both sides: replaces an entire word.
                    // Seems to be rarely if ever used in practice.
                    Some(inner) => whole.push((inner.into(), to.into())),
                    // `^` only: applies to the start of a word.
                    None => start.push((rest.into(), to.into())),
                },
                None => match from.strip_suffix('$') {
                    // `$` only: applies to the end of a word.
                    Some(rest) => end.push((rest.into(), to.into())),
                    // No anchor: applies anywhere in the word.
                    None => anywhere.push((from.into(), to.into())),
                },
            }
        }
        let whole_word_replacements_last_idx = whole.len();
        let start_word_replacements_last_idx = whole_word_replacements_last_idx + start.len();
        let end_word_replacements_last_idx = start_word_replacements_last_idx + end.len();
        let mut table = whole;
        table.extend(start);
        table.extend(end);
        table.extend(anywhere);
        Self {
            table: table.into_boxed_slice(),
            whole_word_replacements_last_idx,
            start_word_replacements_last_idx,
            end_word_replacements_last_idx,
        }
    }
}
impl ReplacementTable {
    /// Pairs anchored as `^…$`: each replaces an entire word.
    #[inline]
    pub fn whole_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
        self.table[..self.whole_word_replacements_last_idx]
            .iter()
            .map(|(from, to)| (&**from, &**to))
    }
    /// Pairs anchored with `^`: apply at the beginning of a word.
    #[inline]
    pub fn start_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
        self.table[self.whole_word_replacements_last_idx..self.start_word_replacements_last_idx]
            .iter()
            .map(|(from, to)| (&**from, &**to))
    }
    /// Pairs anchored with `$`: apply at the end of a word.
    #[inline]
    pub fn end_word_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
        self.table[self.start_word_replacements_last_idx..self.end_word_replacements_last_idx]
            .iter()
            .map(|(from, to)| (&**from, &**to))
    }
    /// Unanchored pairs: apply anywhere in a word.
    #[inline]
    pub fn any_place_replacements(&self) -> impl Iterator<Item = (&str, &str)> {
        self.table[self.end_word_replacements_last_idx..]
            .iter()
            .map(|(from, to)| (&**from, &**to))
    }
    /// Whether the table only has whole word replacements.
    #[inline]
    pub fn has_only_whole_word_replacements(&self) -> bool {
        self.whole_word_replacements_last_idx == self.table.len()
    }
}
/// Individual rules of COMPOUNDRULE patterns.
///
/// Compound rules are a very small regex-like language for describing how stems might be joined
/// in a compound. Each element might be a flag, a zero-or-one wildcard (`?`) or a zero-or-more
/// wildcard (`*`). Typically dictionaries use these to describe how to compound numbers
/// together. The [Spylls docs for `CompoundRule`](https://spylls.readthedocs.io/en/latest/hunspell/data_aff.html?highlight=compound%20rule#spylls.hunspell.data.aff.CompoundRule)
/// have an excellent explanation of a common use-case for compound rules.
///
/// # Representation
///
/// Nuspell doesn't special case `*` and `?` modifiers. It stores the entire rule as a string
/// of `char16_t` (which is also Nuspell flag type). This is quite clever because it allows
/// Nuspell to only spend two bytes per element to store the rule. `CompoundRuleElement` is
/// 4 bytes in comparison. The tradeoff is ambiguity for some `FlagType` representations and more
/// complicated matching code. If a `.aff` file used `FlagType::Numeric`, `*` would be
/// indistinguishable from 42 and `?` indistinguishable from 63. In practice this doesn't seem to
/// be a problem. Nuspell looks ahead in the rule string to find wildcards when matching which is
/// not much more work but is more complicated to understand.
///
/// We use a `[CompoundRuleElement]` in Spellbook only for clarity. Few dictionaries use
/// compound rules and those that do use them tend to use 12 or fewer entries in the table, with
/// each rule being only a few elements long.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct CompoundRuleElement {
    /// The flag a stem's flagset must contain for this element to match.
    pub flag: Flag,
    /// Optional regex-like wildcard attached to the flag, parsed from `?` or `*`.
    pub modifier: Option<CompoundRuleModifier>,
}
/// Wildcard modifiers allowed in a COMPOUNDRULE pattern.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum CompoundRuleModifier {
    /// `?`: the flag may match zero or one stem.
    ZeroOrOne,
    /// `*`: the flag may match zero or more stems.
    ZeroOrMore,
}
/// A single COMPOUNDRULE pattern: a sequence of flag elements matched against the stems
/// of a candidate compound (see `compound_rule_matches`).
type CompoundRule = Box<[CompoundRuleElement]>;
/// Checks whether a sequence of stem flagsets matches a compound rule pattern.
///
/// `data` holds one flagset per stem of a candidate compound, in order. The rule matches
/// when the elements of `pattern` (honoring their `?`/`*` modifiers) can be paired with
/// the stems so that all of `data` is consumed exactly.
///
/// Implemented as an iterative backtracking search over `(pattern_idx, data_idx)` states.
fn compound_rule_matches(pattern: &[CompoundRuleElement], data: &[&FlagSet]) -> bool {
    use crate::alloc::vec;
    use CompoundRuleModifier::*;
    // TODO: try reimplementing with recursion instead of a Vec-based stack?
    // Each stack entry is a `(pattern_idx, data_idx)` state still to be explored.
    let mut stack = vec![(0, 0)];
    while let Some((pattern_idx, data_idx)) = stack.pop() {
        // Pattern exhausted: this branch succeeds only if the data is exhausted too.
        if pattern_idx == pattern.len() {
            return data_idx == data.len();
        }
        // Nuspell does this a little differently. As mentioned in the `CompoundRuleElement` docs
        // they don't preprocess the rules so the wildcards remain in the strings as `*`/`?`
        // characters. It looks ahead (`pattern_idx + 1`) to find the modifier and when it skips
        // over the modifier it jumps ahead by 2. We jump ahead by one below and use the modifier
        // we parsed instead.
        let flag_matches = match data.get(data_idx) {
            Some(flagset) => flagset.contains(&pattern[pattern_idx].flag),
            None => false,
        };
        match pattern[pattern_idx].modifier {
            Some(ZeroOrOne) => {
                // Try as if the flag didn't match (allowed since the pattern may match 0 times).
                stack.push((pattern_idx + 1, data_idx));
                if flag_matches {
                    // Try as if it did match: consume the pattern token and move on.
                    stack.push((pattern_idx + 1, data_idx + 1));
                }
            }
            Some(ZeroOrMore) => {
                // Try as if the flag didn't match (allowed since the pattern may match 0 times).
                stack.push((pattern_idx + 1, data_idx));
                if flag_matches {
                    // Try as if it did match. Don't consume the pattern token because it can
                    // match infinitely.
                    stack.push((pattern_idx, data_idx + 1));
                }
            }
            None => {
                // Without a modifier, only continue searching if the flag matched.
                if flag_matches {
                    stack.push((pattern_idx + 1, data_idx + 1));
                }
            }
        }
    }
    // Every branch failed: the rule does not match.
    false
}
/// A set of rules that can be used to detect whether constructed compounds are allowed.
///
/// TODO: talk about wildcards, show a compounding example.
#[derive(Debug, Clone)]
pub(crate) struct CompoundRuleTable {
    /// The parsed compound rule patterns, checked one by one in `any_rule_matches`.
    rules: Box<[CompoundRule]>,
    /// The union of every flag appearing in any rule. Used by `has_any_flags` to cheaply
    /// decide whether a flagset could participate in any rule at all.
    all_flags: FlagSet,
}
impl From<Vec<CompoundRule>> for CompoundRuleTable {
    fn from(rules: Vec<CompoundRule>) -> Self {
        // Gather every flag used anywhere in the table so membership questions can later
        // be answered without walking each rule.
        let mut flags = Vec::new();
        for rule in rules.iter() {
            for element in rule.iter() {
                flags.push(element.flag);
            }
        }
        Self {
            all_flags: flags.into(),
            rules: rules.into_boxed_slice(),
        }
    }
}
impl CompoundRuleTable {
    /// Returns `true` if the table contains no rules at all.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.rules.is_empty()
    }
    /// Checks whether the given flagset has any flags in common with flags used in any compound
    /// rule.
    #[inline]
    pub fn has_any_flags(&self, flagset: &FlagSet) -> bool {
        self.all_flags.has_intersection(flagset)
    }
    /// Checks whether any rule in the compound table matches the sequence of flagsets.
    ///
    /// Also see Checker's `check_compound_with_rules_impl`. This function is passed a possible
    /// interpretation of a compound. Consider "10th" and the en_US dictionary. The checker might
    /// split up the word to be "1/n1" (L4) and "0th/pt" (L3). So this function would be passed
    /// `&[flagset!['n', '1'], flagset!['p', 't']]`. The matching logic in compound_rule_matches
    /// uses `FlagSet::contains` on these flagsets to look up whether the flagset matches the
    /// pattern. en_US defines the compounding for this as `n*1t` - `n` being a flag on any single
    /// digit, `*` meaning as many of those as you want (including none) and then a required `1`
    /// flag and `t` flag - which match the input. compound_rule_matches tries different
    /// interpretations of the modifiers to check if the pattern matches.
    pub fn any_rule_matches(&self, flagsets: &[&FlagSet]) -> bool {
        for rule in self.rules.iter() {
            if compound_rule_matches(rule, flagsets) {
                return true;
            }
        }
        false
    }
}
/// Storage for two strings within the allocation of one `Box<str>`.
#[derive(Debug, Clone)]
pub(crate) struct StrPair {
    inner: Box<str>,
    /// The `.len()` of the first string: the index of the partition between the first and
    /// second string.
    partition: usize,
}
impl StrPair {
    /// Concatenates `left` and `right` into a single allocation, remembering where `left`
    /// ends so both halves remain recoverable via `partition`.
    pub fn new(left: &str, right: &str) -> Self {
        // Reserve the exact final size up front so that neither push can reallocate.
        let mut string = String::with_capacity(left.len() + right.len());
        string.push_str(left);
        let partition = string.len();
        string.push_str(right);
        Self {
            inner: string.into(),
            partition,
        }
    }
    /// Returns both strings concatenated: `left` followed immediately by `right`.
    #[inline]
    pub fn full_str(&self) -> &str {
        &self.inner
    }
    /// Returns the byte index where the first string ends and the second begins.
    #[inline]
    pub fn partition(&self) -> usize {
        self.partition
    }
}
// NOTE(review): appears to model CHECKCOMPOUNDPATTERN entries; the consuming logic lives in
// the Checker (outside this file) — confirm exact semantics there.
#[derive(Debug, Clone)]
pub(crate) struct CompoundPattern {
    /// The pattern text for the boundary: the end of the first word and the beginning of
    /// the second word, packed into one allocation.
    pub begin_end_chars: StrPair,
    // TODO: unimplemented. See Checker::check_compound_with_pattern_replacements.
    #[allow(dead_code)]
    pub replacement: Option<Box<str>>,
    /// If set, presumably restricts the pattern to first words carrying this flag — TODO confirm
    /// against the Checker.
    pub first_word_flag: Option<Flag>,
    /// If set, presumably restricts the pattern to second words carrying this flag — TODO confirm
    /// against the Checker.
    pub second_word_flag: Option<Flag>,
    pub match_first_only_unaffixed_or_zero_affixed: bool,
}
/// A single replacement pattern used by ICONV and OCONV rules.
#[derive(Debug, Clone)]
struct Conversion {
    /// The text to search for.
    from: Box<str>,
    /// The text that replaces `from`.
    to: Box<str>,
    /// ICONV/OCONV use '_' at the end of the pattern text like '$' in regex: if the match only
    /// applies at the end of the word.
    anchor_end: bool,
}
impl Conversion {
    /// Returns the byte index in `word` at which this conversion's pattern occurs, or `None`
    /// if it doesn't (respecting `anchor_end`).
    fn find(&self, word: &str) -> Option<usize> {
        let pattern = &*self.from;
        // A word with fewer bytes than the pattern cannot contain it.
        if word.len() < pattern.len() {
            return None;
        }
        if self.anchor_end {
            // End-anchored patterns may only match the very end of the word.
            word.ends_with(pattern).then(|| word.len() - pattern.len())
        } else {
            word.find(pattern)
        }
    }
}
/// The conversion table used by ICONV and OCONV rules.
///
/// This is nothing more than a sequence of `(from, to)` replacement pairs. Not many dictionaries
/// use this rule. en_US and a few others use it to replace magic apostrophes "’" with regular
/// ones. Others like french have quite a few rules to normalize similar looking and meaning
/// unicode representations of letters, like "à" becoming "à".
#[derive(Debug, Clone)]
pub(crate) struct ConversionTable {
    inner: Box<[Conversion]>,
}
impl From<Vec<(&str, &str)>> for ConversionTable {
    fn from(mut table: Vec<(&str, &str)>) -> Self {
        // Reverse-lexicographic order puts a longer pattern ahead of any pattern that is a
        // prefix of it.
        table.sort_unstable_by_key(|(from, _to)| core::cmp::Reverse(*from));
        let inner = table
            .into_iter()
            .map(|(pattern, replacement)| {
                // A trailing '_' marks the pattern as end-anchored; drop the marker itself.
                let (pattern, anchor_end) = match pattern.strip_suffix('_') {
                    Some(stripped) => (stripped, true),
                    None => (pattern, false),
                };
                Conversion {
                    to: replacement.into(),
                    from: pattern.into(),
                    anchor_end,
                }
            })
            .collect();
        Self { inner }
    }
}
/// A match of an ICONV/OCONV pattern in a word.
/// The lifetime belongs to the table.
#[derive(Debug)]
struct ConversionMatch<'a> {
    /// The byte index in the word at which the match starts.
    start: usize,
    /// The pattern which matched.
    from: &'a str,
    /// The text that should be used to replace the pattern.
    replacement: &'a str,
}
impl ConversionTable {
    /// Finds the conversion matching the given word (at or after `offset`) which starts at
    /// the earliest byte, preferring the longest pattern on ties.
    fn find_match<'a>(&'a self, word: &str, offset: usize) -> Option<ConversionMatch<'a>> {
        let mut best: Option<ConversionMatch> = None;
        for conversion in self.inner.iter() {
            let Some(idx) = conversion.find(&word[offset..]) else {
                continue;
            };
            // Adjust for having searched within `&word[offset..]`.
            let start = idx + offset;
            // A candidate wins when it starts earlier than the current best, or starts at
            // the same byte with a longer pattern.
            let improves = match best.as_ref() {
                None => true,
                Some(current) => {
                    start < current.start
                        || (start == current.start && conversion.from.len() > current.from.len())
                }
            };
            if improves {
                best = Some(ConversionMatch {
                    start,
                    from: &conversion.from,
                    replacement: &conversion.to,
                });
            }
        }
        best
    }
    /// Applies any ICONV/OCONV conversions to the given word.
    ///
    /// Longer patterns are replaced before shorter ones.
    pub fn convert<'a>(&self, word: &'a str) -> Cow<'a, str> {
        let mut word = Cow::Borrowed(word);
        let mut offset = 0;
        // Repeatedly rewrite the leftmost remaining match. If nothing ever matches, the
        // borrowed input is handed back untouched.
        while let Some(mat) = self.find_match(&word, offset) {
            let end = mat.start + mat.from.len();
            let mut owned = word.into_owned();
            owned.replace_range(mat.start..end, mat.replacement);
            word = Cow::Owned(owned);
            // Resume just past the inserted text so a replacement which contains its own
            // pattern can't loop forever.
            offset = mat.start + mat.replacement.len();
        }
        word
    }
}
/// Selects the locale-specific casing behavior used for case mapping.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) enum CaseHandling {
    /// Turkish, Azerbaijani and Crimean Tartar. These locales require some extra replacement
    /// logic for i-like characters.
    Turkic,
    /// Anything else. These locales can use the standard library's case mapping functions.
    #[default]
    Standard,
}
impl CaseHandling {
    // Casing primitives.
    //
    // The standard library covers us most of the way but there's a locale-specific casing rule
    // that applies to Turkic locales (Turkish, Azerbaijani and Crimean Tartar): "i" uppercases
    // to "İ" and "I" lowercases to "ı". Substituting those characters up front lets the
    // standard mapping handle everything else.
    // TODO: optimize. See the standard library's optimizations.
    /// Lowercases the entire word according to the locale.
    pub fn lowercase(&self, word: &str) -> String {
        if matches!(self, Self::Turkic) {
            // Map the dotted/dotless capitals before the standard mapping runs.
            word.replace('I', "ı").replace('İ', "i").to_lowercase()
        } else {
            word.to_lowercase()
        }
    }
    /// Uppercases the entire word according to the locale.
    pub fn uppercase(&self, word: &str) -> String {
        if matches!(self, Self::Turkic) {
            // Map the dotted/dotless lowercase letters before the standard mapping runs.
            word.replace('i', "İ").replace('ı', "I").to_uppercase()
        } else {
            word.to_uppercase()
        }
    }
    /// Uppercases the first character and lowercases the rest.
    ///
    /// Panics if `word` is empty.
    pub fn titlecase(&self, word: &str) -> String {
        let mut chars = word.chars();
        let first = chars.next().expect("non-empty input");
        let mut output = String::with_capacity(word.len());
        match (self, first) {
            (Self::Turkic, 'i') => output.push('İ'),
            (Self::Turkic, 'ı') => output.push('I'),
            _ => output.extend(first.to_uppercase()),
        }
        for ch in chars {
            match (self, ch) {
                (Self::Turkic, 'I') => output.push('ı'),
                (Self::Turkic, 'İ') => output.push('i'),
                _ => output.extend(ch.to_lowercase()),
            }
        }
        output
    }
    /// Lowercases only the first character, copying the remainder through unchanged.
    ///
    /// Panics if `word` is empty.
    pub fn lower_first_char(&self, word: &str) -> String {
        let mut indices = word.char_indices();
        let (_, first) = indices.next().expect("non-empty input");
        let mut output = String::with_capacity(word.len());
        match (self, first) {
            (Self::Turkic, 'I') => output.push('ı'),
            (Self::Turkic, 'İ') => output.push('i'),
            _ => output.extend(first.to_lowercase()),
        }
        // Everything after the first character is preserved verbatim.
        if let Some((rest, _)) = indices.next() {
            output.push_str(&word[rest..]);
        }
        output
    }
    /// Uppercases the single character starting at byte index `idx`, copying everything else
    /// through unchanged.
    ///
    /// Panics if `idx` is not a character boundary within `word`.
    pub fn upper_char_at(&self, word: &str, idx: usize) -> String {
        let mut output = String::with_capacity(word.len());
        output.push_str(&word[..idx]);
        let mut indices = word[idx..].char_indices();
        let (_, ch) = indices.next().expect("a char at the given index");
        match (self, ch) {
            (Self::Turkic, 'ı') => output.push('I'),
            (Self::Turkic, 'i') => output.push('İ'),
            _ => output.extend(ch.to_uppercase()),
        }
        if let Some((after, _)) = indices.next() {
            output.push_str(&word[idx + after..]);
        }
        output
    }
    /// Checks whether `left` is equal to `right` when `right` is lowercase.
    pub fn is_char_eq_lowercase(&self, left: char, right: char) -> bool {
        // The Turkic dotted/dotless pairs don't round-trip through the standard mapping.
        if matches!(
            (self, left, right),
            (Self::Turkic, 'ı', 'I') | (Self::Turkic, 'i', 'İ')
        ) {
            return true;
        }
        let mut lowered = right.to_lowercase();
        // Only a one-to-one lowercase mapping can compare equal to a single char.
        match (lowered.next(), lowered.next()) {
            (Some(ch), None) => ch == left,
            _ => false,
        }
    }
}
// NOTE(review): presumably a MAP-directive similarity group used during suggestion — confirm
// in the parser and suggester.
#[derive(Debug, Clone)]
pub(crate) struct SimilarityGroup {
    /// The single-character members of the group, packed into one string.
    pub chars: Box<str>,
    /// The multi-character members of the group.
    pub strings: Box<[Box<str>]>,
}
/// All data parsed out of a `.aff` file, in the shape the checker and suggester consume.
#[derive(Debug, Clone)]
pub(crate) struct AffData {
    // checking options
    pub prefixes: PrefixIndex,
    pub suffixes: SuffixIndex,
    pub break_table: BreakTable,
    pub compound_rules: CompoundRuleTable,
    /// Vowel characters used when counting compound syllables — presumably the
    /// COMPOUNDSYLLABLE directive; see `AffOptions::compound_syllable_max`.
    pub compound_syllable_vowels: Box<str>,
    pub compound_patterns: Box<[CompoundPattern]>,
    /// ICONV: conversions applied to words on the way in.
    pub input_conversions: ConversionTable,
    /// OCONV: conversions applied to words on the way out.
    pub output_conversions: ConversionTable,
    // locale TODO
    // suggestion options
    pub replacements: ReplacementTable,
    pub similarities: Box<[SimilarityGroup]>,
    // phonetic_table: PhoneticTable,
    /// Characters stripped from words — presumably the IGNORE directive; confirm in the parser.
    pub ignore_chars: Box<[char]>,
    /// Adjacent-key description — presumably the KEY directive; confirm in the parser.
    pub keyboard_closeness: Box<str>,
    /// Characters to try when producing suggestions — presumably the TRY directive.
    pub try_chars: Box<str>,
    pub options: AffOptions,
    // Parsing options. These are preserved so that we can re-use them in `Dictionary::add`.
    pub flag_type: FlagType,
    pub flag_aliases: Box<[FlagSet]>,
}
/// The scalar options (booleans, flags and limits) parsed from a `.aff` file.
///
/// Most fields appear to correspond one-to-one with `.aff` directives; see the parser for
/// the exact mapping. Note that `forbidden_word_flag` is the only non-optional flag — it
/// always has a value (see the `Default` impl).
#[derive(Debug, Clone, Copy)]
pub(crate) struct AffOptions {
    pub complex_prefixes: bool,
    pub fullstrip: bool,
    pub checksharps: bool,
    pub forbid_warn: bool,
    pub only_in_compound_flag: Option<Flag>,
    pub circumfix_flag: Option<Flag>,
    pub forbidden_word_flag: Flag,
    pub keep_case_flag: Option<Flag>,
    pub need_affix_flag: Option<Flag>,
    pub warn_flag: Option<Flag>,
    // compounding options
    pub compound_flag: Option<Flag>,
    pub compound_begin_flag: Option<Flag>,
    pub compound_middle_flag: Option<Flag>,
    pub compound_end_flag: Option<Flag>,
    // These `Option<NonZeroU16>`s represent counts or sizes and a zero value isn't accepted.
    // Being the same as a flag's representation is coincidence.
    pub compound_min_length: Option<NonZeroU16>,
    pub compound_max_word_count: Option<NonZeroU16>,
    pub compound_permit_flag: Option<Flag>,
    pub compound_forbid_flag: Option<Flag>,
    pub compound_root_flag: Option<Flag>,
    pub compound_force_uppercase_flag: Option<Flag>,
    pub compound_more_suffixes: bool,
    pub compound_check_duplicate: bool,
    pub compound_check_rep: bool,
    pub compound_check_case: bool,
    pub compound_check_triple: bool,
    pub compound_simplified_triple: bool,
    pub compound_syllable_num: bool,
    pub compound_syllable_max: Option<NonZeroU16>,
    pub max_compound_suggestions: u16,
    pub no_suggest_flag: Option<Flag>,
    pub substandard_flag: Option<Flag>,
    pub max_ngram_suggestions: u16,
    pub max_diff_factor: u16,
    pub only_max_diff: bool,
    pub no_split_suggestions: bool,
    pub suggest_with_dots: bool,
    pub case_handling: CaseHandling,
}
impl Default for AffOptions {
    // Everything defaults to zero/false/None except the handful of fields with explicit
    // values below. (`..Default::default()` struct-update syntax can't be used here: it
    // would recurse into this very impl.)
    fn default() -> Self {
        Self {
            // Hunspell:
            // // default flags
            // #define DEFAULTFLAGS 65510
            // #define FORBIDDENWORD 65510
            // #define ONLYUPCASEFLAG 65511
            complex_prefixes: Default::default(),
            fullstrip: Default::default(),
            checksharps: Default::default(),
            forbid_warn: Default::default(),
            only_in_compound_flag: Default::default(),
            circumfix_flag: Default::default(),
            // Matches Hunspell's FORBIDDENWORD default (65510) quoted above.
            forbidden_word_flag: Flag::new(65510).unwrap(),
            keep_case_flag: Default::default(),
            need_affix_flag: Default::default(),
            warn_flag: Default::default(),
            compound_flag: Default::default(),
            compound_begin_flag: Default::default(),
            compound_middle_flag: Default::default(),
            compound_end_flag: Default::default(),
            compound_min_length: Default::default(),
            compound_max_word_count: Default::default(),
            compound_permit_flag: Default::default(),
            compound_forbid_flag: Default::default(),
            compound_root_flag: Default::default(),
            compound_force_uppercase_flag: Default::default(),
            compound_more_suffixes: Default::default(),
            compound_check_duplicate: Default::default(),
            compound_check_rep: Default::default(),
            compound_check_case: Default::default(),
            compound_check_triple: Default::default(),
            compound_simplified_triple: Default::default(),
            compound_syllable_num: Default::default(),
            compound_syllable_max: Default::default(),
            // Non-zero suggestion limits:
            max_compound_suggestions: 3,
            no_suggest_flag: Default::default(),
            substandard_flag: Default::default(),
            max_ngram_suggestions: 5,
            max_diff_factor: 5,
            only_max_diff: Default::default(),
            no_split_suggestions: Default::default(),
            suggest_with_dots: Default::default(),
            case_handling: Default::default(),
        }
    }
}
impl AffOptions {
    /// Whether any of the flags that enable compounding was declared — i.e. whether compound
    /// checking can ever apply for this dictionary.
    pub fn allows_compounding(&self) -> bool {
        [
            self.compound_flag,
            self.compound_begin_flag,
            self.compound_middle_flag,
            self.compound_end_flag,
        ]
        .iter()
        .any(Option::is_some)
    }
}
#[cfg(test)]
mod test {
    use super::*;
    /// Shorthand for constructing a single `Flag` from a char or integer literal.
    /// Panics (via `unwrap`) if `Flag::new` rejects the value.
    macro_rules! flag {
        ( $x:expr ) => {{
            Flag::new($x as u16).unwrap()
        }};
    }
    /// Shorthand for constructing a `FlagSet` from zero or more char/integer literals.
    macro_rules! flagset {
        () => {{
            FlagSet::default()
        }};
        ( $( $x:expr ),* ) => {
            {
                FlagSet::from( $crate::alloc::vec![ $( Flag::new( $x as u16 ).unwrap() ),* ] )
            }
        };
    }
#[test]
fn condition_matches() {
    // Exercises the syntax `Condition` supports: literal characters and
    // positive/negative character classes.
    // No special characters
    assert!("foo".parse::<Condition>().unwrap().matches("foo"));
    // Fast lane: the input is shorter (bytes) than the number of characters in the pattern.
    assert!(!"foo".parse::<Condition>().unwrap().matches("fo"));
    // Positive character class
    let condition = "xx[abc]x".parse::<Condition>().unwrap();
    assert!(condition.matches("xxax"));
    assert!(condition.matches("xxbx"));
    assert!(condition.matches("xxcx"));
    assert!(!condition.matches("xxdx"));
    // Negative character class
    let condition = "xx[^abc]x".parse::<Condition>().unwrap();
    assert!(!condition.matches("xxax"));
    assert!(!condition.matches("xxbx"));
    assert!(!condition.matches("xxcx"));
    assert!(condition.matches("xxdx"));
}
#[test]
fn condition_nuspell_unit_test() {
    // Upstream: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/tests/unit_test.cxx#L167-L299>
    // Our structure for Condition is different so we only port the prefix tests.
    // A literal condition matches any input that starts with it, regardless of what follows.
    let cond = "abcd".parse::<Condition>().unwrap();
    assert!(cond.matches("abcd"));
    assert!(cond.matches("abcdXYZ"));
    assert!(cond.matches("abcdБВГДШ\u{ABCD}\u{10ABCD}"));
    assert!(!cond.matches(""));
    assert!(!cond.matches("abc"));
    assert!(!cond.matches("abcX"));
    assert!(!cond.matches("XYZ"));
    assert!(!cond.matches("АаБбВвГгШш\u{ABCD}\u{10ABCD}"));
    // A positive class matches when the first character is one of its members.
    let cond = "[vbn]".parse::<Condition>().unwrap();
    assert!(cond.matches("v"));
    assert!(cond.matches("vAAш"));
    assert!(cond.matches("b"));
    assert!(cond.matches("bBBш"));
    assert!(cond.matches("n"));
    assert!(cond.matches("nCCш"));
    assert!(!cond.matches(""));
    assert!(!cond.matches("Q"));
    assert!(!cond.matches("Qqqq"));
    assert!(!cond.matches("1342234"));
    assert!(!cond.matches("V"));
    assert!(!cond.matches("бвгдш"));
    // Classes work with non-ASCII members too.
    let cond = "[бш\u{1234}]".parse::<Condition>().unwrap();
    assert!(cond.matches("б"));
    assert!(cond.matches("бeT"));
    assert!(cond.matches("ш"));
    assert!(cond.matches("шок"));
    assert!(cond.matches("\u{1234}кош"));
    assert!(!cond.matches(""));
    assert!(!cond.matches("Q"));
    assert!(!cond.matches("Qqqq"));
    assert!(!cond.matches("пан"));
    assert!(!cond.matches("\u{ABCD}\u{1234}"));
    assert!(!cond.matches("вбгдш"));
    // A negative class matches when the first character is NOT one of its members.
    let cond = "[^zш\u{1234}\u{10ABCD}]".parse::<Condition>().unwrap();
    assert!(!cond.matches("z"));
    assert!(!cond.matches("ш"));
    assert!(!cond.matches("\u{1234}"));
    assert!(!cond.matches("\u{10ABCD}"));
    assert!(!cond.matches("zљње"));
    assert!(!cond.matches("шabc"));
    assert!(!cond.matches("\u{1234} ytyty"));
    assert!(!cond.matches("\u{10ABCD} tytyty"));
    assert!(cond.matches("q"));
    assert!(cond.matches("r"));
    assert!(cond.matches("\u{FFFD}"));
    assert!(cond.matches("\u{10FFFF}"));
    assert!(cond.matches("qљње"));
    assert!(cond.matches("фabc"));
    assert!(cond.matches("\u{FFFD} ytyty"));
    assert!(cond.matches("\u{10FFFF} tytyty"));
    // A combined pattern: literals, `.` wildcards and both class kinds.
    let cond = "abc АБВ..[zбш\u{1234}][^zш\u{1234}\u{10ABCD}]X"
        .parse::<Condition>()
        .unwrap();
    assert!(cond.matches("abc АБВ \u{2345}z\u{011111}X"));
    assert!(cond.matches("abc АБВ\u{2345} ш\u{011112}Xопop"));
    assert!(!cond.matches("abc ШШШ \u{2345}z\u{011111}X"));
    assert!(!cond.matches("abc АБВ\u{2345} t\u{011112}Xопop"));
    assert!(!cond.matches("abc АБВ \u{2345}z\u{1234}X"));
}
#[test]
fn string_pair() {
    // Check both halves of the `StrPair` contract: the concatenated text and the
    // partition index separating the two strings. (The original version of this test
    // never asserted `partition()`.)
    let pair = StrPair::new("foo", "bar");
    assert_eq!(pair.full_str(), "foobar");
    assert_eq!(pair.partition(), 3);
    let pair = StrPair::new("", "");
    assert_eq!(pair.full_str(), "");
    assert_eq!(pair.partition(), 0);
}
#[test]
fn break_table_nuspell_unit_test() {
    // Upstream: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/tests/unit_test.cxx#L100-L127>
    // `^`/`$` anchors partition the entries into start/middle/end break patterns, with the
    // anchor characters stripped.
    let table = BreakTable::new(&[
        "bsd", "zxc", "asd", "^bar", "^zoo", "^abc", "car$", "yoyo$", "air$",
    ]);
    // Sort before comparing so the assertions don't depend on iteration order.
    let mut starts: Vec<_> = table.start_word_breaks().collect();
    starts.sort_unstable();
    assert_eq!(&["abc", "bar", "zoo"], starts.as_slice());
    let mut middles: Vec<_> = table.middle_word_breaks().collect();
    middles.sort_unstable();
    assert_eq!(&["asd", "bsd", "zxc"], middles.as_slice());
    let mut ends: Vec<_> = table.end_word_breaks().collect();
    ends.sort_unstable();
    assert_eq!(&["air", "car", "yoyo"], ends.as_slice());
}
#[test]
fn prefix_suffix_nuspell_unit_test() {
    // Upstream: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/tests/unit_test.cxx#L301-L313>
    // `to_derived` strips the affix's strip-text from the stem and applies its add-text;
    // `to_stem` inverts the operation.
    let prefix = Prefix::new(flag!('F'), false, Some("qw"), "Qwe", None, flagset![]).unwrap();
    assert_eq!(prefix.to_derived("qwrty").as_str(), "Qwerty");
    assert_eq!(prefix.to_stem("Qwerty").as_ref(), "qwrty");
    let suffix = Suffix::new(flag!('F'), false, Some("ie"), "ying", None, flagset![]).unwrap();
    assert_eq!(suffix.to_derived("pie").as_str(), "pying");
    assert_eq!(suffix.to_stem("pying").as_ref(), "pie");
}
#[test]
fn empty_affix_index() {
    // An index built from no affixes yields no candidates for any input.
    let index: PrefixIndex = [].into_iter().collect();
    assert!(index.affixes_of("anything").next().is_none());
    let index: SuffixIndex = [].into_iter().collect();
    assert!(index.affixes_of("anything").next().is_none());
}
#[test]
fn affix_index_prefix_multiset_nuspell_unit_test() {
    // Upstream: <https://github.com/nuspell/nuspell/blob/b37faff6ea630a4a1bfb22097d455224b4239f8e/tests/unit_test.cxx#L315-L329>
    // Builds a prefix whose only distinguishing feature is its add-text.
    fn prefix(add: &str) -> Prefix {
        Prefix::new(Flag::new(1).unwrap(), true, None, add, None, flagset![]).unwrap()
    }
    let index: PrefixIndex = [
        "", "a", "", "ab", "abx", "as", "asdf", "axx", "as", "bqwe", "ba", "rqwe",
    ]
    .into_iter()
    .map(prefix)
    .collect();
    // `affixes_of` yields every stored prefix of the word, duplicates included, shortest
    // first.
    let prefixes: Vec<_> = index
        .affixes_of("asdfg")
        .map(|prefix| prefix.add.as_ref())
        .collect();
    assert_eq!(&["", "", "a", "as", "as", "asdf"], prefixes.as_slice());
}
#[test]
fn affix_index_suffix_multiset_nuspell_unit_test() {
    // Upstream: <https://github.com/nuspell/nuspell/blob/b37faff6ea630a4a1bfb22097d455224b4239f8e/tests/unit_test.cxx#L331-L345>
    // Same shape as the prefix test above, but matching against the end of the word.
    fn suffix(add: &str) -> Suffix {
        Suffix::new(Flag::new(1).unwrap(), true, None, add, None, flagset![]).unwrap()
    }
    let index: SuffixIndex = [
        "", "", "a", "b", "b", "ab", "ub", "zb", "aub", "uub", "xub", "huub",
    ]
    .into_iter()
    .map(suffix)
    .collect();
    let suffixes: Vec<_> = index
        .affixes_of("ahahuub")
        .map(|suffix| suffix.add.as_ref())
        .collect();
    assert_eq!(
        &["", "", "b", "b", "ub", "uub", "huub"],
        suffixes.as_slice()
    );
}
#[test]
fn affix_index_en_us_suffix_example() {
    // This suffix is from `en_US.aff`:
    //
    // SFX D Y 4
    // SFX D 0 d e
    // SFX D y ied [^aeiou]y
    // SFX D 0 ed [^ey]
    // SFX D 0 ed [aeiou]y
    let flag = Flag::new('D' as u16).unwrap();
    let suffix1 = Suffix::new(flag, true, None, "d", Some("e"), flagset![]).unwrap();
    let suffix2 =
        Suffix::new(flag, true, Some("y"), "ied", Some("[^aeiou]y"), flagset![]).unwrap();
    let suffix3 = Suffix::new(flag, true, None, "ed", Some("[^ey]"), flagset![]).unwrap();
    let suffix4 = Suffix::new(flag, true, None, "ed", Some("[aeiou]y"), flagset![]).unwrap();
    let index: SuffixIndex = [&suffix1, &suffix2, &suffix3, &suffix4]
        .into_iter()
        .cloned()
        .collect();
    // From `en_US.dic`: `ace/DSMG`. The "ace" stem can be turned into "aced" with the above
    // suffix rules, specifically the first rule (`suffix1`). However all of these suffixes
    // except `suffix2` are returned by `affixes_of`.
    let word = "aced";
    let affixes: Vec<&Suffix> = index.affixes_of(word).collect();
    assert_eq!(&[&suffix1, &suffix3, &suffix4], affixes.as_slice());
    // Note: even though the condition can match, we would also need to look up the produced
    // stem in the word list to confirm that "aced" is a valid word.
    // suffix1: "aced" minus "d" gives "ace", and "ace" ends in "e" so the condition holds.
    let stem1 = suffix1.to_stem(word);
    assert_eq!(&stem1, "ace");
    assert!(suffix1.condition_matches(&stem1));
    // suffix3: "aced" minus "ed" gives "ac", and "c" satisfies `[^ey]`.
    let stem3 = suffix3.to_stem(word);
    assert_eq!(&stem3, "ac");
    assert!(suffix3.condition_matches(&stem3));
    // suffix4: same stem "ac", but "c" fails `[aeiou]y`, so the condition rejects it.
    let stem4 = suffix4.to_stem(word);
    assert_eq!(&stem4, "ac");
    assert!(!suffix4.condition_matches(&stem4));
}
fn compound_rule_matches(pattern: &[CompoundRuleElement], data: &str) -> bool {
let flagsets: Vec<_> = data.chars().map(|ch| flagset!(ch)).collect();
let borrowed: Vec<_> = flagsets.iter().collect();
super::compound_rule_matches(pattern, &borrowed)
}
#[test]
fn compound_rule_matches_literal() {
    // Without modifiers every flag must match exactly once, in order.
    let rule = parser::parse_compound_rule("abc", FlagType::default()).unwrap();
    assert!(compound_rule_matches(&rule, "abc"));
    assert!(!compound_rule_matches(&rule, "ac"));
    assert!(!compound_rule_matches(&rule, "abcd"));
}
#[test]
fn compound_rule_matches_zero_or_one() {
    // `b?` may be absent or appear once, but never more.
    let rule = parser::parse_compound_rule("ab?c", FlagType::default()).unwrap();
    assert!(compound_rule_matches(&rule, "ac"));
    assert!(compound_rule_matches(&rule, "abc"));
    assert!(!compound_rule_matches(&rule, "ab"));
    assert!(!compound_rule_matches(&rule, "bc"));
    assert!(!compound_rule_matches(&rule, "abb"));
    assert!(!compound_rule_matches(&rule, "abbc"));
}
#[test]
fn compound_rule_matches_zero_or_more() {
    // `b*` may repeat any number of times, including zero.
    let rule = parser::parse_compound_rule("ab*c", FlagType::default()).unwrap();
    assert!(compound_rule_matches(&rule, "ac"));
    assert!(compound_rule_matches(&rule, "abc"));
    assert!(compound_rule_matches(&rule, "abbc"));
    assert!(compound_rule_matches(&rule, "abbbc"));
    // etc.
    assert!(!compound_rule_matches(&rule, "ab"));
    assert!(!compound_rule_matches(&rule, "abb"));
    assert!(!compound_rule_matches(&rule, "abbcc"));
}
#[test]
fn compound_rule_simple_regex_nuspell_unit_test() {
    // Upstream: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/tests/unit_test.cxx#L384-L393>
    // A rule mixing both modifiers with literal flags.
    let rule = parser::parse_compound_rule("abc?de*ff", FlagType::default()).unwrap();
    assert!(compound_rule_matches(&rule, "abdff"));
    assert!(compound_rule_matches(&rule, "abcdff"));
    assert!(compound_rule_matches(&rule, "abdeeff"));
    assert!(compound_rule_matches(&rule, "abcdeff"));
    assert!(!compound_rule_matches(&rule, "abcdeeeefff"));
    assert!(!compound_rule_matches(&rule, "qwerty"));
}
#[test]
fn casing_conversions_nuspell_unit_test() {
    // Upstream: <https://github.com/nuspell/nuspell/blob/6e46eb31708003fa02796ee8dc0c9e57248ba141/tests/unit_test.cxx#L440-L448>
    // Standard handling: the std mappings apply, including the multi-char "ß" -> "SS".
    let word = "grüßEN";
    assert_eq!(&CaseHandling::default().lowercase(word), "grüßen");
    assert_eq!(&CaseHandling::default().uppercase(word), "GRÜSSEN");
    assert_eq!(&CaseHandling::default().titlecase(word), "Grüßen");
    // Turkic handling: dotted/dotless i pairs ("i" <-> "İ", "I" <-> "ı") differ from the
    // Standard results on the same input.
    let word = "isTAnbulI";
    assert_eq!(&CaseHandling::default().lowercase(word), "istanbuli");
    assert_eq!(&CaseHandling::default().uppercase(word), "ISTANBULI");
    assert_eq!(&CaseHandling::default().titlecase(word), "Istanbuli");
    assert_eq!(&CaseHandling::Turkic.lowercase(word), "istanbulı");
    assert_eq!(&CaseHandling::Turkic.uppercase(word), "İSTANBULI");
    assert_eq!(&CaseHandling::Turkic.titlecase(word), "İstanbulı");
}
}