1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
use std::borrow::Cow;

use crate::ascii_folding_filter;

/// Returns `true` when `c` should terminate the current term:
/// any Unicode whitespace, or one of the common word separators
/// recognized by `separating_filter`.
#[inline(always)]
pub fn split_terms(c: char) -> bool {
    if c.is_whitespace() {
        true
    } else {
        separating_filter(c)
    }
}

// A "boundary" character is anything that cannot be part of a term.
// NOTE(review): only ASCII lowercase letters and digits count as term
// characters here — this presumes input was lowercased upstream; confirm.
fn boundary_filter(c: char) -> bool {
    !matches!(c, 'a'..='z' | '0'..='9')
}

// Things that commonly "separate" words, apart from whitespaces
pub fn separating_filter(c: char) -> bool {
    match c {
        '[' |
        ']' |
        '\\' |
        '(' |
        ')' |
        '{' |
        '}' |
        '&' |
        '|' |
        '"' |
        '`' |
        '<' |
        '>' |
        '#' |
        ':' |
        ';' |
        '~' |
        '^' |
        '=' |
        '-' |
        '+' |
        '*' |
        '/' |
        '。' |
        '《' |
        '》' |
        '…' |
        '•' |
        '?' |
        '!' |
        ',' |
        '.' |
        '@' |
        '\u{2045}' | // ⁅  [LEFT SQUARE BRACKET WITH QUILL]
        '\u{2772}' | // ❲  [LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT]
        '\u{FF3B}' | // [  [FULLWIDTH LEFT SQUARE BRACKET]
        '\u{2046}' | // ⁆  [RIGHT SQUARE BRACKET WITH QUILL]
        '\u{2773}' | // ❳  [LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT]
        '\u{FF3D}' | // ]  [FULLWIDTH RIGHT SQUARE BRACKET]
        '\u{FF3C}' | // \  [FULLWIDTH REVERSE SOLIDUS]
        '\u{207D}' | // ⁽  [SUPERSCRIPT LEFT PARENTHESIS]
        '\u{208D}' | // ₍  [SUBSCRIPT LEFT PARENTHESIS]
        '\u{2768}' | // ❨  [MEDIUM LEFT PARENTHESIS ORNAMENT]
        '\u{276A}' | // ❪  [MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT]
        '\u{FF08}' | // (  [FULLWIDTH LEFT PARENTHESIS]
        '\u{2E28}' | // ⸨  [LEFT DOUBLE PARENTHESIS]
        '\u{207E}' | // ⁾  [SUPERSCRIPT RIGHT PARENTHESIS]
        '\u{208E}' | // ₎  [SUBSCRIPT RIGHT PARENTHESIS]
        '\u{2769}' | // ❩  [MEDIUM RIGHT PARENTHESIS ORNAMENT]
        '\u{276B}' | // ❫  [MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT]
        '\u{FF09}' | // )  [FULLWIDTH RIGHT PARENTHESIS]
        '\u{2E29}' | // ⸩  [RIGHT DOUBLE PARENTHESIS]
        '\u{2774}' | // ❴  [MEDIUM LEFT CURLY BRACKET ORNAMENT]
        '\u{FF5B}' | // {  [FULLWIDTH LEFT CURLY BRACKET]
        '\u{2775}' | // ❵  [MEDIUM RIGHT CURLY BRACKET ORNAMENT]
        '\u{FF5D}' | // }  [FULLWIDTH RIGHT CURLY BRACKET]
        '\u{FF06}' | // &  [FULLWIDTH AMPERSAND]
        '\u{00AB}' | // «  [LEFT-POINTING DOUBLE ANGLE QUOTATION MARK]
        '\u{00BB}' | // »  [RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK]
        '\u{201C}' | // “  [LEFT DOUBLE QUOTATION MARK]
        '\u{201D}' | // ”  [RIGHT DOUBLE QUOTATION MARK]
        '\u{201E}' | // „  [DOUBLE LOW-9 QUOTATION MARK]
        '\u{275D}' | // ❝  [HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT]
        '\u{275E}' | // ❞  [HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT]
        '\u{276E}' | // ❮  [HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT]
        '\u{276F}' | // ❯  [HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT]
        '\u{FF02}' | // "  [FULLWIDTH QUOTATION MARK]
        '\u{276C}' | // ❬  [MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT]
        '\u{2770}' | // ❰  [HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT]
        '\u{FF1C}' | // <  [FULLWIDTH LESS-THAN SIGN]
        '\u{276D}' | // ❭  [MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT]
        '\u{2771}' | // ❱  [HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT]
        '\u{FF1E}' | // >  [FULLWIDTH GREATER-THAN SIGN]
        '\u{FF03}' | // #  [FULLWIDTH NUMBER SIGN]
        '\u{FF1A}' | // :  [FULLWIDTH COLON]
        '\u{204F}' | // ⁏  [REVERSED SEMICOLON]
        '\u{FF1B}' | // ;  [FULLWIDTH SEMICOLON]
        '\u{2053}' | // ⁓  [SWUNG DASH]
        '\u{FF5E}' | // ~  [FULLWIDTH TILDE]
        '\u{2038}' | // ‸  [CARET]
        '\u{FF3E}' | // ^  [FULLWIDTH CIRCUMFLEX ACCENT]
        '\u{207C}' | // ⁼  [SUPERSCRIPT EQUALS SIGN]
        '\u{208C}' | // ₌  [SUBSCRIPT EQUALS SIGN]
        '\u{FF1D}' | // =  [FULLWIDTH EQUALS SIGN]
        '\u{207A}' | // ⁺  [SUPERSCRIPT PLUS SIGN]
        '\u{208A}' | // ₊  [SUBSCRIPT PLUS SIGN]
        '\u{FF0B}' | // +  [FULLWIDTH PLUS SIGN]
        '\u{204E}' | // ⁎  [LOW ASTERISK]
        '\u{FF0A}' | // *  [FULLWIDTH ASTERISK]
        '\u{2044}' | // ⁄  [FRACTION SLASH]
        '\u{FF0F}' | // /  [FULLWIDTH SOLIDUS]
        '\u{2049}' | // ⁉  [EXCLAMATION QUESTION MARK]
        '\u{FF1F}' | // ?  [FULLWIDTH QUESTION MARK]
        '\u{2047}' | // ⁇  [DOUBLE QUESTION MARK]
        '\u{FF01}' | // !  [FULLWIDTH EXCLAMATION MARK]
        '\u{203C}' | // ‼  [DOUBLE EXCLAMATION MARK]
        '\u{2048}' | // ⁈  [QUESTION EXCLAMATION MARK]
        '\u{FF0C}' | // ,  [FULLWIDTH COMMA]
        '\u{FF0E}' | // .  [FULLWIDTH FULL STOP]
        '\u{FF20}' | // @  [FULLWIDTH COMMERCIAL AT]
        // More controversial ones
        '\u{2013}' | // –  [EN DASH]
        '\u{2014}' | // —  [EM DASH]
        '\u{201A}' | // ‚  [SINGLE LOW-9 QUOTATION MARK]
        '\u{2039}' | // ‹  [SINGLE LEFT-POINTING ANGLE QUOTATION MARK]
        '\u{203A}' | // ›  [SINGLE RIGHT-POINTING ANGLE QUOTATION MARK]
        '\u{275B}' | // ❛  [HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT]
        '\u{275C}'   // ❜  [HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT]
        => true,
        _ => false
    }
}

// Things that commonly appear within words
pub fn intra_filter(c: char) -> bool {
    match c {
        '\'' |
        '-' |
        '_' |
        '*' |
        '\u{2019}' | // ’  [RIGHT SINGLE QUOTATION MARK]
        '\u{2018}' | // ‘  [LEFT SINGLE QUOTATION MARK]
        '\u{2010}' | // ‐  [HYPHEN]
        '\u{2011}' | // ‑  [NON-BREAKING HYPHEN]
        '\u{2012}' | // ‒  [FIGURE DASH]
        '\u{207B}' | // ⁻  [SUPERSCRIPT MINUS]
        '\u{208B}' | // ₋  [SUBSCRIPT MINUS]
        '\u{FF0D}' | // -  [FULLWIDTH HYPHEN-MINUS]
        '\u{FF3F}' | // _  [FULLWIDTH LOW LINE]
        '\u{FF07}' | // '  [FULLWIDTH APOSTROPHE]
        '\u{2032}' | // ′  [PRIME]
        '\u{2035}' | // ‵  [REVERSED PRIME]
        '\u{201B}' | // ‛  [SINGLE HIGH-REVERSED-9 QUOTATION MARK]
        // Moved from above
        '\u{2033}' | // ″  [DOUBLE PRIME]
        '\u{2036}'   // ‶  [REVERSED DOUBLE PRIME]
        => true,
        _ => false
    }
}

/// Strips removable characters from a term, returning `Cow::Borrowed` when no
/// boundary character (see `boundary_filter`) exists in the input at all.
///
/// Removed are:
/// - the leading run of contiguous boundary characters (if the term starts with one),
/// - the trailing run of contiguous boundary characters (if the term ends with one),
/// - any `intra_filter` character (hyphens, apostrophes, ...), wherever it appears.
///
/// Boundary characters in the middle of the term that are not intra-word
/// characters are kept (e.g. "a1a⥄a2a" stays unchanged — see tests below).
pub fn term_filter(input: Cow<str>) -> Cow<str> {
    // Visit only the characters that are candidates for removal.
    let mut char_iter = input.char_indices().filter(|(_idx, c)| boundary_filter(*c));

    if let Some((mut char_start, mut c)) = char_iter.next() {
        // Output is accumulated as raw bytes of kept segments; all slice
        // boundaries below are char boundaries of `input`, so the final
        // from_utf8_unchecked is sound.
        let mut output: Vec<u8> = Vec::with_capacity(input.len());
        // Byte index where the current contiguous run of boundary characters began.
        let mut prev_seg_start = 0;
        // Byte index just past the previously visited boundary character.
        let mut prev_char_end = 0;
        // Everything before this byte index has been either copied to `output` or dropped.
        let mut prev_seg_end = 0;

        loop {
            if prev_char_end != char_start {
                // Non-boundary text separates this character from the previous
                // one: a new boundary run starts here.
                prev_seg_start = char_start;
            }

            prev_char_end = char_start + c.len_utf8();

            // Drop this character if it belongs to the leading boundary run
            // (prev_seg_start == 0 means the run is anchored at index 0), or
            // if it is an intra-word character, which is removed anywhere.
            if prev_seg_start == 0 || intra_filter(c) {
                // Guarantees:
                // prev_seg_end is never assigned the last character's end before this op
                // char_start is always assigned to the start of a character's index in the input string
                output.extend_from_slice(unsafe { input.get_unchecked(prev_seg_end..char_start) }.as_bytes());
                prev_seg_end = prev_char_end;
            }

            debug_assert!(prev_seg_start != prev_seg_end);

            if let Some((next_idx, next_c)) = char_iter.next() {
                char_start = next_idx;
                c = next_c;
            } else {
                // No more boundary characters. If the final boundary run
                // reaches the end of the input and was not already dropped,
                // cut the copy short at its start to strip it.
                let has_end_boundary_seg = prev_char_end == input.len();
                let end = if prev_seg_start > prev_seg_end && has_end_boundary_seg {
                    prev_seg_start
                } else {
                    input.len()
                };

                // Guarantees:
                // - prev_seg_start is never the last character's end.
                // - prev_seg_end is the last character's end at most.
                //   - if it is the last character, prev_seg_start < prev_seg_end, and end = input.len()
                //   - if not,
                //     - if prev_seg_start > prev_seg_end
                //        - if has_end_boundary_seg, prev_seg_start = start of the end boundary segment
                //        - if !has_end_boundary_seg, prev_seg_start = start of an intermediate, non filterable segment
                //     - if prev_seg_start < prev_seg_end, the segment is still being contiguously filtered
                // - slicing str[str.len()..] is still valid.
                output.extend_from_slice(unsafe { input.get_unchecked(prev_seg_end..end) }.as_bytes());
                return Cow::Owned(unsafe { String::from_utf8_unchecked(output) });
            }
        }
    } else {
        // No boundary characters at all: nothing to strip, return the input as-is.
        input
    }
}

/// ASCII-folds `term_slice`, applies `term_filter`, and records every distinct
/// intermediate form into `term_inflections` along the way:
/// the raw slice, the ASCII-folded form (if folding changed it and it is
/// non-empty), an apostrophe→right-single-quote variant (if an apostrophe is
/// present), and the filtered form (if filtering changed it and it is
/// non-blank). Returns the final filtered term.
pub fn ascii_and_nonword_filter<'a, F: Fn(Cow<str>) -> Cow<str>>(
    term_inflections: &mut Vec<String>,
    term_slice: &'a str,
    term_filter: F,
) -> Cow<'a, str> {
    term_inflections.push(term_slice.to_owned());

    let ascii_replaced = ascii_folding_filter::to_ascii(term_slice);
    // An owned Cow means folding actually changed something.
    if let Cow::Owned(folded) = &ascii_replaced {
        if !folded.is_empty() {
            term_inflections.push(folded.clone());
        }
    }

    if ascii_replaced.contains('\'') {
        // Somewhat hardcoded fix for this common keyboard "issue"
        term_inflections.push(ascii_replaced.replace('\'', "’"));
    }

    match term_filter(ascii_replaced) {
        Cow::Owned(filtered) => {
            if !filtered.trim().is_empty() {
                term_inflections.push(filtered.clone());
            }
            Cow::Owned(filtered)
        }
        // Borrowed result: filtering removed nothing, pass it through.
        unchanged => unchanged,
    }
}

#[cfg(test)]
pub mod test {
    use std::borrow::Cow;

    use super::term_filter;

    /// Checks that `term_filter` maps `input` to exactly `expected`.
    fn assert(input: &str, expected: &str) {
        assert_eq!(term_filter(Cow::Borrowed(input)), expected);
    }

    #[test]
    fn removes_single_characters() {
        for (input, expected) in [("我", ""), ("-", ""), ("⥄", ""), ("a", "a")] {
            assert(input, expected);
        }
    }

    #[test]
    fn removes_intermediate_characters() {
        for (input, expected) in [
            ("a1a-a2a", "a1aa2a"),
            ("a1a-'_a2a", "a1aa2a"),
            ("a1a⥄a2a", "a1a⥄a2a"),
            ("a1a-'⥄a2a", "a1a⥄a2a"),
        ] {
            assert(input, expected);
        }
    }

    #[test]
    fn removes_starting_characters() {
        for input in ["-a1aa2a", "⥄a1aa2a", "⥄⥄a1aa2a"] {
            assert(input, "a1aa2a");
        }
    }

    #[test]
    fn removes_ending_characters() {
        for input in ["a1aa2a-", "a1aa2a*", "a1aa2a⥄", "a1aa2a⥄⥄"] {
            assert(input, "a1aa2a");
        }
    }

    #[test]
    fn removes_all_characters() {
        assert("-a1a-a2a-", "a1aa2a");
        assert("-a1a⥄a2a-", "a1a⥄a2a");
    }

    #[test]
    fn removes_single_quotations() {
        for input in ["today's", "today‛s", "today’s", "today‘s"] {
            assert(input, "todays");
        }
    }
}