use alloc::string::String;
use alloc::vec::Vec;

use crate::ngram::char_ngrams;
use crate::stopwords::StopwordSet;
use crate::synonym::SynonymMap;
use crate::token::TokenKind;
use crate::Tokenizer;

/// A token produced for full-text-search indexing.
///
/// Produced by [`FtsTokenizer::segment_for_fts`]; carries the token text
/// plus FTS metadata: position among non-whitespace tokens, stopword
/// status, synonym expansions, and character n-grams.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// The token text, taken from the normalized input.
    pub text: String,
    /// 0-based position among emitted (non-whitespace) tokens.
    pub position: usize,
    /// Token classification from the underlying tokenizer.
    pub kind: TokenKind,
    /// `true` when the text is present in the configured stopword set.
    pub is_stop: bool,
    /// Synonym expansions for this token; empty when none are mapped.
    pub synonyms: Vec<String>,
    /// Character n-grams; populated only for `TokenKind::Unknown` tokens
    /// when the configured n-gram size is greater than zero.
    pub trigrams: Vec<String>,
}
/// Builder for [`FtsTokenizer`].
///
/// Each `None` field falls back to a default at `build` time:
/// built-in stopwords, an empty synonym map, and n-gram size 3.
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    // Stopword set; defaults to `StopwordSet::builtin` when unset.
    stopwords: Option<StopwordSet>,
    // Synonym map; defaults to `SynonymMap::empty` when unset.
    synonyms: Option<SynonymMap>,
    // N-gram size for Unknown tokens; defaults to 3 when unset.
    ngram_size: Option<usize>,
}
61impl FtsTokenizerBuilder {
62 pub fn stopwords(mut self, s: StopwordSet) -> Self {
64 self.stopwords = Some(s);
65 self
66 }
67
68 pub fn synonyms(mut self, m: SynonymMap) -> Self {
70 self.synonyms = Some(m);
71 self
72 }
73
74 pub fn ngram_size(mut self, n: usize) -> Self {
78 self.ngram_size = Some(n);
79 self
80 }
81
82 pub fn build(self) -> FtsTokenizer {
84 FtsTokenizer {
85 tokenizer: Tokenizer::new(),
86 stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
87 synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
88 ngram_size: self.ngram_size.unwrap_or(3),
89 }
90 }
91}
/// A tokenizer configured for full-text-search indexing.
///
/// Wraps the base [`Tokenizer`] and annotates its output with stopword,
/// synonym, and character n-gram information. Construct with
/// [`FtsTokenizer::new`] or [`FtsTokenizer::builder`].
pub struct FtsTokenizer {
    // Underlying normalizer/segmenter.
    tokenizer: Tokenizer,
    // Set used to flag stopword tokens.
    stopwords: StopwordSet,
    // Map used to attach synonym expansions.
    synonyms: SynonymMap,
    // N-gram size for Unknown tokens; 0 disables n-gram generation.
    ngram_size: usize,
}
114impl FtsTokenizer {
115 pub fn new() -> Self {
117 FtsTokenizerBuilder::default().build()
118 }
119
120 pub fn builder() -> FtsTokenizerBuilder {
122 FtsTokenizerBuilder::default()
123 }
124
125 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
136 let normalized = self.tokenizer.normalize(text);
137 let raw_tokens = self.tokenizer.segment(&normalized);
138
139 let mut result = Vec::with_capacity(raw_tokens.len());
140 let mut position = 0usize;
141
142 for token in &raw_tokens {
143 if token.kind == TokenKind::Whitespace {
144 continue;
145 }
146
147 let is_stop = self.stopwords.contains(token.text);
148 let synonyms = self
149 .synonyms
150 .expand(token.text)
151 .map(|s| s.to_vec())
152 .unwrap_or_default();
153 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
154 char_ngrams(token.text, self.ngram_size)
155 .map(String::from)
156 .collect()
157 } else {
158 Vec::new()
159 };
160
161 result.push(FtsToken {
162 text: String::from(token.text),
163 position,
164 kind: token.kind,
165 is_stop,
166 synonyms,
167 trigrams,
168 });
169
170 position += 1;
171 }
172
173 result
174 }
175
176 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
181 self.segment_for_fts(text)
182 .into_iter()
183 .filter(|t| !t.is_stop)
184 .collect()
185 }
186
187 pub fn lexemes(&self, text: &str) -> Vec<String> {
193 let tokens = self.index_tokens(text);
194 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
195 for t in tokens {
196 out.push(t.text.clone());
197 out.extend(t.synonyms);
198 out.extend(t.trigrams);
199 }
200 out
201 }
202}
204impl Default for FtsTokenizer {
205 fn default() -> Self {
206 Self::new()
207 }
208}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::stopwords::StopwordSet;
    use crate::synonym::SynonymMap;

    /// Shorthand for a default-configured tokenizer.
    fn fts() -> FtsTokenizer {
        FtsTokenizer::new()
    }

    #[test]
    fn empty_input_returns_empty() {
        let tokens = fts().segment_for_fts("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn whitespace_tokens_excluded() {
        for token in fts().segment_for_fts("กิน ข้าว") {
            assert_ne!(token.kind, TokenKind::Whitespace);
        }
    }

    #[test]
    fn positions_are_sequential() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        let mut i = 0usize;
        for t in &tokens {
            assert_eq!(t.position, i, "position mismatch at index {i}");
            i += 1;
        }
    }

    #[test]
    fn known_stopword_is_tagged() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        match tokens.iter().find(|t| t.text == "กับ") {
            Some(t) => assert!(t.is_stop, "'กับ' should be tagged as stopword"),
            None => panic!("expected 'กับ' token"),
        }
    }

    #[test]
    fn content_words_not_tagged_as_stop() {
        for t in fts().segment_for_fts("โรงพยาบาล") {
            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
        }
    }

    #[test]
    fn text_is_reconstructable() {
        let tokenizer = fts();
        let text = "กินข้าวกับปลา";
        let rebuilt = tokenizer
            .segment_for_fts(text)
            .iter()
            .map(|t| t.text.as_str())
            .collect::<String>();
        assert_eq!(rebuilt, tokenizer.tokenizer.normalize(text));
    }

    #[test]
    fn synonym_expansion_attached() {
        let fts = FtsTokenizer::builder()
            .stopwords(StopwordSet::from_text(""))
            .synonyms(SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n"))
            .build();
        let tokens = fts.segment_for_fts("คอม");
        if let Some(tok) = tokens.iter().find(|t| t.text == "คอม") {
            assert!(
                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
                "expected synonym expansion, got {:?}",
                tok.synonyms
            );
        }
    }

    #[test]
    fn no_synonyms_when_map_empty() {
        for t in fts().segment_for_fts("กินข้าว") {
            assert!(t.synonyms.is_empty());
        }
    }

    #[test]
    fn unknown_token_gets_trigrams() {
        let fts = FtsTokenizer::builder()
            .stopwords(StopwordSet::from_text(""))
            .ngram_size(2)
            .build();
        let tokens = fts.segment_for_fts("กิ");
        let unknown: Vec<&FtsToken> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
            .collect();
        assert!(
            !unknown.is_empty(),
            "expected at least one multi-char Unknown token for 'กิ'"
        );
        for u in unknown {
            assert!(
                !u.trigrams.is_empty(),
                "unknown token '{}' ({} chars) should have bigrams",
                u.text,
                u.text.chars().count()
            );
        }
    }

    #[test]
    fn known_thai_token_has_no_trigrams() {
        for t in fts().segment_for_fts("กิน") {
            if t.kind != TokenKind::Thai {
                continue;
            }
            assert!(
                t.trigrams.is_empty(),
                "known Thai token '{}' should not have trigrams",
                t.text
            );
        }
    }

    #[test]
    fn ngram_size_zero_disables_trigrams() {
        let fts = FtsTokenizer::builder()
            .stopwords(StopwordSet::from_text(""))
            .ngram_size(0)
            .build();
        assert!(fts
            .segment_for_fts("กขคง")
            .iter()
            .all(|t| t.trigrams.is_empty()));
    }

    #[test]
    fn index_tokens_excludes_stopwords() {
        for t in fts().index_tokens("กินข้าวกับปลา") {
            assert!(!t.is_stop);
        }
    }

    #[test]
    fn index_tokens_preserves_positions() {
        let text = "กินข้าวกับปลา";
        let full = fts().segment_for_fts(text);
        for t in fts().index_tokens(text) {
            assert!(
                full.iter().any(|a| a.position == t.position),
                "indexed token at position {} not found in full token list",
                t.position
            );
        }
    }

    #[test]
    fn lexemes_returns_non_stop_texts() {
        let lexemes = fts().lexemes("กินข้าวกับปลา");
        assert!(!lexemes.iter().any(|l| l == "กับ"));
        assert!(
            lexemes
                .iter()
                .any(|l| matches!(l.as_str(), "กิน" | "ข้าว" | "ปลา")),
            "expected content words in lexemes: {lexemes:?}"
        );
    }

    #[test]
    fn lexemes_empty_input_is_empty() {
        let lexemes = fts().lexemes("");
        assert!(lexemes.is_empty());
    }

    #[test]
    fn builder_custom_stopwords() {
        let custom = StopwordSet::from_text("กิน\n");
        let fts = FtsTokenizer::builder().stopwords(custom).build();
        let tokens = fts.segment_for_fts("กินข้าว");
        if let Some(t) = tokens.iter().find(|t| t.text == "กิน") {
            assert!(t.is_stop, "'กิน' should be stop with custom list");
        }
    }

    #[test]
    fn builder_default_equals_new() {
        assert_eq!(
            FtsTokenizer::new().lexemes("กินข้าว"),
            FtsTokenizer::builder().build().lexemes("กินข้าว")
        );
    }
}