1use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::ne::NeTagger;
31use crate::ngram::char_ngrams;
32use crate::pos::{PosTag, PosTagger};
33use crate::romanizer::RomanizationMap;
34use crate::stopwords::StopwordSet;
35use crate::synonym::SynonymMap;
36use crate::token::{NamedEntityKind, TokenKind};
37use crate::Tokenizer;
38
/// A segmented token enriched with full-text-search metadata:
/// stopword status, synonym expansions, character n-grams, and
/// optional part-of-speech / named-entity tags.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// Surface text of the token (taken from the normalized input).
    pub text: String,
    /// Zero-based position among emitted (non-whitespace) tokens.
    pub position: usize,
    /// Token classification (Thai, Named, Unknown, ...).
    pub kind: TokenKind,
    /// True when the token matched the configured stopword set.
    pub is_stop: bool,
    /// Synonym expansions; also carries the RTGS romanization when a
    /// romanization map is configured on the tokenizer.
    pub synonyms: Vec<String>,
    /// Character n-grams generated for `Unknown` tokens; empty otherwise.
    /// (Named "trigrams" for the default n-gram size of 3.)
    pub trigrams: Vec<String>,
    /// Part-of-speech tag, assigned only to plain Thai tokens.
    pub pos: Option<PosTag>,
    /// Named-entity kind, present when the NE tagger tagged this token.
    pub ne: Option<NamedEntityKind>,
}
61
/// Builder for [`FtsTokenizer`]. Every field is optional; unset fields
/// fall back to built-in defaults in [`FtsTokenizerBuilder::build`].
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    // None => StopwordSet::builtin() at build time.
    stopwords: Option<StopwordSet>,
    // None => SynonymMap::empty() at build time.
    synonyms: Option<SynonymMap>,
    // None => 3 at build time; 0 disables n-gram generation.
    ngram_size: Option<usize>,
    // None => PosTagger::builtin() at build time.
    pos_tagger: Option<PosTagger>,
    // None => NeTagger::builtin() at build time.
    ne_tagger: Option<NeTagger>,
    // Stays optional: no romanization is performed when None.
    romanization: Option<RomanizationMap>,
}
72
73impl FtsTokenizerBuilder {
74 pub fn stopwords(mut self, s: StopwordSet) -> Self {
76 self.stopwords = Some(s);
77 self
78 }
79
80 pub fn synonyms(mut self, m: SynonymMap) -> Self {
82 self.synonyms = Some(m);
83 self
84 }
85
86 pub fn ngram_size(mut self, n: usize) -> Self {
90 self.ngram_size = Some(n);
91 self
92 }
93
94 pub fn pos_tagger(mut self, t: PosTagger) -> Self {
96 self.pos_tagger = Some(t);
97 self
98 }
99
100 pub fn ne_tagger(mut self, t: NeTagger) -> Self {
102 self.ne_tagger = Some(t);
103 self
104 }
105
106 pub fn romanization(mut self, m: RomanizationMap) -> Self {
114 self.romanization = Some(m);
115 self
116 }
117
118 pub fn build(self) -> FtsTokenizer {
120 FtsTokenizer {
121 tokenizer: Tokenizer::new(),
122 stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
123 synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
124 ngram_size: self.ngram_size.unwrap_or(3),
125 pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
126 ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
127 romanization: self.romanization,
128 }
129 }
130}
131
/// A tokenizer pipeline producing [`FtsToken`]s for full-text-search
/// indexing: normalize -> segment -> NE-tag -> enrich (stopwords,
/// synonyms, n-grams, POS). Construct via [`FtsTokenizer::new`] or
/// [`FtsTokenizer::builder`].
pub struct FtsTokenizer {
    // Base normalizer/segmenter.
    tokenizer: Tokenizer,
    // Marks tokens as stopwords (is_stop flag).
    stopwords: StopwordSet,
    // Expands token texts into synonym lists.
    synonyms: SynonymMap,
    // n-gram width for Unknown tokens; 0 disables n-gram generation.
    ngram_size: usize,
    // Tags plain Thai tokens with a part of speech.
    pos_tagger: PosTagger,
    // Tags (and may merge) tokens into named entities.
    ne_tagger: NeTagger,
    // Optional: adds an RTGS romanization to Thai/Named token synonyms.
    romanization: Option<RomanizationMap>,
}
155
156impl FtsTokenizer {
157 pub fn new() -> Self {
159 FtsTokenizerBuilder::default().build()
160 }
161
162 pub fn builder() -> FtsTokenizerBuilder {
164 FtsTokenizerBuilder::default()
165 }
166
167 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
178 let normalized = self.tokenizer.normalize(text);
179 let raw_tokens = self
180 .ne_tagger
181 .tag_tokens(self.tokenizer.segment(&normalized), &normalized);
182
183 let mut result = Vec::with_capacity(raw_tokens.len());
184 let mut position = 0usize;
185
186 for token in &raw_tokens {
187 if token.kind == TokenKind::Whitespace {
188 continue;
189 }
190
191 let is_stop = self.stopwords.contains(token.text);
192 let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
193 let mut synonyms = self
194 .synonyms
195 .expand(token.text)
196 .map(|s| s.to_vec())
197 .unwrap_or_default();
198 if is_thai_or_named {
199 if let Some(ref rom) = self.romanization {
200 if let Some(rtgs) = rom.romanize(token.text) {
201 synonyms.push(String::from(rtgs));
202 }
203 }
204 }
205 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
206 char_ngrams(token.text, self.ngram_size)
207 .map(String::from)
208 .collect()
209 } else {
210 Vec::new()
211 };
212 let ne = if let TokenKind::Named(k) = token.kind {
213 Some(k)
214 } else {
215 None
216 };
217 let pos = if token.kind == TokenKind::Thai {
218 self.pos_tagger.tag(token.text)
219 } else {
220 None
221 };
222
223 result.push(FtsToken {
224 text: String::from(token.text),
225 position,
226 kind: token.kind,
227 is_stop,
228 synonyms,
229 trigrams,
230 pos,
231 ne,
232 });
233
234 position += 1;
235 }
236
237 result
238 }
239
240 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
245 self.segment_for_fts(text)
246 .into_iter()
247 .filter(|t| !t.is_stop)
248 .collect()
249 }
250
251 pub fn lexemes(&self, text: &str) -> Vec<String> {
257 let tokens = self.index_tokens(text);
258 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
259 for t in tokens {
260 out.push(t.text.clone());
261 out.extend(t.synonyms);
262 out.extend(t.trigrams);
263 }
264 out
265 }
266}
267
268impl Default for FtsTokenizer {
269 fn default() -> Self {
270 Self::new()
271 }
272}
273
#[cfg(test)]
mod tests {
    use super::*;
    use crate::stopwords::StopwordSet;
    use crate::synonym::SynonymMap;

    /// Convenience constructor: a tokenizer with all built-in defaults.
    fn fts() -> FtsTokenizer {
        FtsTokenizer::new()
    }

    #[test]
    fn empty_input_returns_empty() {
        assert!(fts().segment_for_fts("").is_empty());
    }

    #[test]
    fn whitespace_tokens_excluded() {
        let tokens = fts().segment_for_fts("กิน ข้าว");
        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
    }

    #[test]
    fn positions_are_sequential() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        for (i, t) in tokens.iter().enumerate() {
            assert_eq!(t.position, i, "position mismatch at index {i}");
        }
    }

    #[test]
    fn known_stopword_is_tagged() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        let kap = tokens.iter().find(|t| t.text == "กับ");
        assert!(kap.is_some(), "expected 'กับ' token");
        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
    }

    #[test]
    fn content_words_not_tagged_as_stop() {
        let tokens = fts().segment_for_fts("โรงพยาบาล");
        for t in &tokens {
            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
        }
    }

    #[test]
    fn text_is_reconstructable() {
        let fts = fts();
        let text = "กินข้าวกับปลา";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn synonym_expansion_attached() {
        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
        let fts = FtsTokenizer::builder()
            .synonyms(synonyms)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("คอม");
        // Conditional on the token surviving segmentation intact.
        let t = tokens.iter().find(|t| t.text == "คอม");
        if let Some(tok) = t {
            assert!(
                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
                "expected synonym expansion, got {:?}",
                tok.synonyms
            );
        }
    }

    #[test]
    fn no_synonyms_when_map_empty() {
        let tokens = fts().segment_for_fts("กินข้าว");
        for t in &tokens {
            assert!(t.synonyms.is_empty());
        }
    }

    #[test]
    fn unknown_token_gets_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(2)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิ");
        let unknown: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
            .collect();
        assert!(
            !unknown.is_empty(),
            "expected at least one multi-char Unknown token for 'กิ'"
        );
        for u in &unknown {
            assert!(
                !u.trigrams.is_empty(),
                "unknown token '{}' ({} chars) should have bigrams",
                u.text,
                u.text.chars().count()
            );
        }
    }

    #[test]
    fn known_thai_token_has_no_trigrams() {
        let tokens = fts().segment_for_fts("กิน");
        for t in &tokens {
            if t.kind == TokenKind::Thai {
                assert!(
                    t.trigrams.is_empty(),
                    "known Thai token '{}' should not have trigrams",
                    t.text
                );
            }
        }
    }

    #[test]
    fn ngram_size_zero_disables_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(0)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กขคง");
        for t in &tokens {
            assert!(t.trigrams.is_empty());
        }
    }

    #[test]
    fn index_tokens_excludes_stopwords() {
        let tokens = fts().index_tokens("กินข้าวกับปลา");
        assert!(tokens.iter().all(|t| !t.is_stop));
    }

    #[test]
    fn index_tokens_preserves_positions() {
        let all = fts().segment_for_fts("กินข้าวกับปลา");
        let indexed = fts().index_tokens("กินข้าวกับปลา");
        for t in &indexed {
            // Match on both position and text: a position-only check is
            // vacuously true for any position below all.len().
            assert!(
                all.iter()
                    .any(|a| a.position == t.position && a.text == t.text),
                "indexed token '{}' at position {} not found in full token list",
                t.text,
                t.position
            );
        }
    }

    #[test]
    fn lexemes_returns_non_stop_texts() {
        let lexemes = fts().lexemes("กินข้าวกับปลา");
        assert!(!lexemes.contains(&String::from("กับ")));
        assert!(
            lexemes
                .iter()
                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
            "expected content words in lexemes: {lexemes:?}"
        );
    }

    #[test]
    fn lexemes_empty_input_is_empty() {
        assert!(fts().lexemes("").is_empty());
    }

    #[test]
    fn multi_token_ne_merged_in_pipeline() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
        let named: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
            .collect();
        assert!(
            named.iter().any(|t| t.text == "กรุงเทพ"),
            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.kind))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    #[test]
    fn multi_token_ne_reconstructable() {
        let fts = FtsTokenizer::new();
        let text = "ไปกรุงเทพ";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn builder_custom_stopwords() {
        let stops = StopwordSet::from_text("กิน\n");
        let fts = FtsTokenizer::builder().stopwords(stops).build();
        let tokens = fts.segment_for_fts("กินข้าว");
        let gin = tokens.iter().find(|t| t.text == "กิน");
        if let Some(t) = gin {
            assert!(t.is_stop, "'กิน' should be stop with custom list");
        }
    }

    #[test]
    fn builder_default_equals_new() {
        let a = FtsTokenizer::new().lexemes("กินข้าว");
        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
        assert_eq!(a, b);
    }
}