crdb_core/
fts.rs

1use icu_casemap::CaseMapper;
2use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
3use icu_properties::sets::diacritic;
4use icu_segmenter::WordSegmenter;
5use rust_stemmers::{Algorithm, Stemmer};
6use std::fmt::Debug;
7use writeable::Writeable;
8
// Word segmenter used by `normalize` to split its input into segments. Being declared in
// `thread_local!`, every thread gets its own instance, built with `WordSegmenter::new_auto()`.
// NOTE(review): presumably thread-local so that the segmenter is built only once per thread
// instead of once per `normalize` call — confirm.
thread_local! {
    static SEGMENTER: WordSegmenter = WordSegmenter::new_auto();
}
12
// Unicode helpers used by `normalize`. They are `const` items, so each mention of one of
// these names evaluates its constructor expression in place.
// Used for case folding.
const CASEMAPPER: CaseMapper = CaseMapper::new();
// NFD decomposition, so that diacritics become separate code points that can be filtered out.
const DECOMPOSER: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
// NFC recomposition of whatever remains once the diacritics have been removed.
const RECOMPOSER: ComposingNormalizer = ComposingNormalizer::new_nfc();
16
/// Returns the version number associated with the normalizer; currently always `0`.
// NOTE(review): presumably meant to be bumped whenever the output of `normalize` changes, so
// that previously stored normalized strings can be detected as stale — confirm with callers.
pub fn normalizer_version() -> i32 {
    const CURRENT_VERSION: i32 = 0;
    CURRENT_VERSION
}
20
21pub fn normalize(input: &str) -> String {
22    SEGMENTER.with(|segmenter| {
23        let mut res = String::with_capacity(input.len());
24        let mut last_brk = 0;
25        let mut segments = segmenter.segment_str(input);
26        let mut buf = String::new();
27        let mut buf2 = String::new();
28        // For each word
29        while let Some(next_brk) = segments.next() {
30            if segments.is_word_like() {
31                res.push(' ');
32
33                // Fold case
34                buf.clear();
35                CASEMAPPER
36                    .fold(&input[last_brk..next_brk])
37                    .write_to(&mut buf)
38                    .unwrap();
39                // And remove diacritics
40                buf2.clear();
41                buf2.extend(
42                    RECOMPOSER.normalize_iter(
43                        DECOMPOSER
44                            .normalize_iter(buf.chars())
45                            .filter(|c| !diacritic().contains(*c)),
46                    ),
47                );
48                // Finally, stem for french and english for now
49                // TODO(misc-med): think how to make this more international? applying two stemmers is bad(tm)
50                // We should probably be using eg. cld3 to detect the language, and then stem accordingly
51                res.push_str(
52                    &Stemmer::create(Algorithm::English)
53                        .stem(&Stemmer::create(Algorithm::French).stem(&buf2)),
54                );
55            }
56            last_brk = next_brk;
57        }
58        // Start and finish with a space, for easier matching
59        res.push(' ');
60        res
61    })
62}
63
/// Returns whether the normalized pattern `pat` occurs in the normalized `value`.
///
/// Both arguments must already have gone through `normalize`; this function does not
/// normalize anything itself. Normalized strings delimit every word with spaces, so this plain
/// substring search only succeeds on a contiguous run of whole normalized words.
#[inline]
pub fn matches(value: &str, pat: &str) -> bool {
    str::contains(value, pat)
}
70
/// A `String` newtype whose serialized form also carries its `normalize`d version.
///
/// It dereferences (mutably too) to the wrapped `String`, and its comparison traits are derived
/// from that single field. Serialization and deserialization both go through
/// `SearchableStringSer`.
#[derive(Clone, deepsize::DeepSizeOf, educe::Educe, serde::Deserialize, serde::Serialize)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[educe(Deref, DerefMut, Eq, Ord, PartialEq, PartialOrd)]
#[serde(from = "SearchableStringSer", into = "SearchableStringSer")]
pub struct SearchableString(#[educe(Deref, DerefMut)] pub String);
76
77impl SearchableString {
78    pub fn new() -> SearchableString {
79        SearchableString(String::new())
80    }
81}
82
83impl Default for SearchableString {
84    fn default() -> SearchableString {
85        SearchableString::new()
86    }
87}
88
89impl<T: Into<String>> From<T> for SearchableString {
90    fn from(value: T) -> SearchableString {
91        SearchableString(value.into())
92    }
93}
94
95impl Debug for SearchableString {
96    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
97        self.0.fmt(fmt)
98    }
99}
100
/// Serialized shape of `SearchableString`: the raw string together with its normalized form.
#[derive(serde::Deserialize, serde::Serialize)]
struct SearchableStringSer {
    /// The original, un-normalized string. Serialized under the key `_crdb-str`.
    #[serde(rename = "_crdb-str")]
    value: String,

    /// Result of `normalize` on `value`, computed when converting from a `SearchableString`.
    /// Serialized under the key `_crdb-normalized`; dropped when converting back.
    #[serde(rename = "_crdb-normalized")]
    normalized: String,
}
109
110impl From<SearchableString> for SearchableStringSer {
111    fn from(value: SearchableString) -> SearchableStringSer {
112        let value: String = value.0;
113        SearchableStringSer {
114            normalized: normalize(&value),
115            value,
116        }
117    }
118}
119
120impl From<SearchableStringSer> for SearchableString {
121    fn from(value: SearchableStringSer) -> SearchableString {
122        SearchableString(value.value)
123    }
124}
125
#[cfg(test)]
mod tests {
    use super::{matches, normalize};

    /// Compares the output of `normalize` with hand-written expectations.
    #[test]
    fn basic_examples() {
        let tests = [
            ("Je   suis bien embêté !", " je sui bien embet "),
            (
                " Some 色々な言語の façon de faire un test :) ",
                " som 色 々 な 言語 facon de fair un test ",
            ),
            ("ば", " は "), // japanese diacritics too
            ("coupe-papier", " coup papi "),
        ];
        for (before, after) in tests {
            let normalized = normalize(before);
            assert_eq!(
                normalized, after,
                "normalization of {before:?} didn't match",
            );
        }
    }

    /// Checks `matches` on normalized data/pattern pairs: only whole words match.
    #[test]
    fn basic_matches() {
        let tests = [
            ("foobar", "foobar", true),
            ("foobar", "", true),
            ("foobar", "foo", false),
            ("i think", "think", true),
        ];
        for (data, pat, res) in tests {
            let found = matches(&normalize(data), &normalize(pat));
            assert_eq!(
                found, res,
                "expected fts::matches({data:?}, {pat:?}) = {res:?} failed",
            );
        }
    }

    /// Feeds arbitrary strings to `normalize`; the output itself is not inspected.
    #[test]
    fn fuzz_normalizer() {
        bolero::check!().with_type().for_each(|s: &String| {
            normalize(s);
        });
    }
}