1use icu_casemap::CaseMapper;
2use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
3use icu_properties::sets::diacritic;
4use icu_segmenter::WordSegmenter;
5use rust_stemmers::{Algorithm, Stemmer};
6use std::fmt::Debug;
7use writeable::Writeable;
8
thread_local! {
    // One `WordSegmenter` per thread, built with `WordSegmenter::new_auto()`
    // the first time the thread accesses it. `normalize` reaches it through
    // `SEGMENTER.with(..)`.
    static SEGMENTER: WordSegmenter = WordSegmenter::new_auto();
}
12
// ICU helpers used by `normalize`. They are `const` items, so the constructor
// expressions below are evaluated at compile time.

/// Case mapper; `normalize` calls its `fold` method on every word-like segment.
const CASEMAPPER: CaseMapper = CaseMapper::new();
/// NFD normalizer; `normalize` runs it before filtering out diacritics.
const DECOMPOSER: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
/// NFC normalizer; `normalize` runs it on what is left after the diacritic filter.
const RECOMPOSER: ComposingNormalizer = ComposingNormalizer::new_nfc();
16
/// Returns the version number of the normalization scheme.
///
/// Currently always `0`.
///
/// NOTE(review): presumably meant to be bumped whenever the output of
/// `normalize` changes, so stored normalized text can be detected as stale —
/// confirm against callers.
pub fn normalizer_version() -> i32 {
    const CURRENT_VERSION: i32 = 0;
    CURRENT_VERSION
}
20
21pub fn normalize(input: &str) -> String {
22 SEGMENTER.with(|segmenter| {
23 let mut res = String::with_capacity(input.len());
24 let mut last_brk = 0;
25 let mut segments = segmenter.segment_str(input);
26 let mut buf = String::new();
27 let mut buf2 = String::new();
28 while let Some(next_brk) = segments.next() {
30 if segments.is_word_like() {
31 res.push(' ');
32
33 buf.clear();
35 CASEMAPPER
36 .fold(&input[last_brk..next_brk])
37 .write_to(&mut buf)
38 .unwrap();
39 buf2.clear();
41 buf2.extend(
42 RECOMPOSER.normalize_iter(
43 DECOMPOSER
44 .normalize_iter(buf.chars())
45 .filter(|c| !diacritic().contains(*c)),
46 ),
47 );
48 res.push_str(
52 &Stemmer::create(Algorithm::English)
53 .stem(&Stemmer::create(Algorithm::French).stem(&buf2)),
54 );
55 }
56 last_brk = next_brk;
57 }
58 res.push(' ');
60 res
61 })
62}
63
/// Returns `true` when `pat` occurs as a substring of `value`.
///
/// The tests in this file call it with both arguments already passed through
/// `normalize`. Because normalized text encloses every word in spaces, a
/// normalized pattern then only matches on whole-word boundaries.
#[inline]
pub fn matches(value: &str, pat: &str) -> bool {
    value.find(pat).is_some()
}
70
/// A `String` newtype whose serialized form also carries the normalized text.
///
/// Serialization and deserialization go through `SearchableStringSer` (see
/// the `serde(from = .., into = ..)` attribute below). Comparison traits as
/// well as `Deref`/`DerefMut` are derived with `educe`; the `Deref` target is
/// the wrapped field.
#[derive(Clone, deepsize::DeepSizeOf, educe::Educe, serde::Deserialize, serde::Serialize)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[educe(Deref, DerefMut, Eq, Ord, PartialEq, PartialOrd)]
#[serde(from = "SearchableStringSer", into = "SearchableStringSer")]
pub struct SearchableString(#[educe(Deref, DerefMut)] pub String);
76
77impl SearchableString {
78 pub fn new() -> SearchableString {
79 SearchableString(String::new())
80 }
81}
82
83impl Default for SearchableString {
84 fn default() -> SearchableString {
85 SearchableString::new()
86 }
87}
88
89impl<T: Into<String>> From<T> for SearchableString {
90 fn from(value: T) -> SearchableString {
91 SearchableString(value.into())
92 }
93}
94
95impl Debug for SearchableString {
96 fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
97 self.0.fmt(fmt)
98 }
99}
100
/// Serialized representation of `SearchableString`: the raw string together
/// with its normalized form.
#[derive(serde::Deserialize, serde::Serialize)]
struct SearchableStringSer {
    /// The original string, stored under the `_crdb-str` key.
    #[serde(rename = "_crdb-str")]
    value: String,

    /// Output of `normalize` on `value`, computed when converting from a
    /// `SearchableString` and stored under the `_crdb-normalized` key. It is
    /// not read back when converting to a `SearchableString`.
    #[serde(rename = "_crdb-normalized")]
    normalized: String,
}
109
110impl From<SearchableString> for SearchableStringSer {
111 fn from(value: SearchableString) -> SearchableStringSer {
112 let value: String = value.0;
113 SearchableStringSer {
114 normalized: normalize(&value),
115 value,
116 }
117 }
118}
119
120impl From<SearchableStringSer> for SearchableString {
121 fn from(value: SearchableStringSer) -> SearchableString {
122 SearchableString(value.value)
123 }
124}
125
#[cfg(test)]
mod tests {
    use super::{matches, normalize};

    /// Checks `normalize` against hand-written expected outputs.
    #[test]
    fn basic_examples() {
        let cases = [
            ("Je suis bien embêté !", " je sui bien embet "),
            (
                " Some 色々な言語の façon de faire un test :) ",
                " som 色 々 な 言語 facon de fair un test ",
            ),
            ("ば", " は "),
            ("coupe-papier", " coup papi "),
        ];
        for (before, expected) in cases {
            assert_eq!(
                normalize(before),
                expected,
                "normalization of {before:?} didn't match",
            );
        }
    }

    /// Checks `matches` on normalized data/pattern pairs.
    #[test]
    fn basic_matches() {
        let cases = [
            ("foobar", "foobar", true),
            ("foobar", "", true),
            ("foobar", "foo", false),
            ("i think", "think", true),
        ];
        for (data, pat, res) in cases {
            let normalized_data = normalize(data);
            let normalized_pat = normalize(pat);
            assert_eq!(
                matches(&normalized_data, &normalized_pat),
                res,
                "expected fts::matches({data:?}, {pat:?}) = {res:?} failed",
            );
        }
    }

    /// Feeds arbitrary strings to `normalize`; only the absence of panics is checked.
    #[test]
    fn fuzz_normalizer() {
        bolero::check!().with_type().for_each(|input: &String| {
            let _ = normalize(input);
        });
    }
}