1mod base;
2mod index;
3mod ngram;
4mod pipeline;
5mod search;
6mod tokenizer;
7mod types;
8
9pub use types::{
10 DocData, InMemoryIndex, PositionEncoding, SNAPSHOT_VERSION, SearchHit, SearchMode,
11 SnapshotData, TermDomain,
12};
13
14pub use tokenizer::dictionary::{
15 DictionaryConfig, DictionaryLanguage, DictionarySegmenter, ScriptDictionary,
16 train_dictionary_config,
17};
18
19#[cfg(test)]
20mod tests {
21 use super::types::MatchedTerm;
22 use super::*;
23 use std::collections::HashSet;
24 use tempfile::tempdir;
25
    // Index name passed to every indexing/search call made by the tests in this module.
    const INDEX: &str = "test-index";
    // Document id used by the tests that index Chinese or mixed Chinese/ASCII content.
    const DOC_CN: &str = "doc-cn";
    // Document id used by the tests that index English/ASCII content.
    const DOC_EN: &str = "doc-en";
    // Document id used by the tests that index Japanese content.
    const DOC_JP: &str = "doc-jp";
61
62 fn assert_contains_doc(results: &[(String, f64)], doc_id: &str) {
63 assert!(
64 results.iter().any(|(id, _)| id == doc_id),
65 "expected results to contain doc {doc_id}, got {:?}",
66 results
67 );
68 }
69
70 #[test]
71 fn chinese_full_pinyin_search() {
72 let mut index = InMemoryIndex::default();
73 index.add_doc(INDEX, DOC_CN, "你好世界", true);
74
75 let hits = index.search(INDEX, "nihao");
76 assert_contains_doc(&hits, DOC_CN);
77 }
78
79 #[test]
80 fn chinese_initials_search() {
81 let mut index = InMemoryIndex::default();
82 index.add_doc(INDEX, DOC_CN, "你好世界", true);
83
84 let hits = index.search(INDEX, "nh");
85 assert_contains_doc(&hits, DOC_CN);
86 }
87
88 #[test]
89 fn chinese_initials_prefix_search() {
90 let mut index = InMemoryIndex::default();
91 index.add_doc(INDEX, DOC_CN, "你好世界", true);
92
93 let hits = index.search(INDEX, "nhs");
94 assert_contains_doc(&hits, DOC_CN);
95
96 let exact = index.get_matches(INDEX, DOC_CN, "nhsj");
97 assert!(!exact.is_empty());
98 let hit = index
99 .search_hits(INDEX, "nhs")
100 .into_iter()
101 .find(|h| h.doc_id == DOC_CN)
102 .expect("expected hit for prefix query");
103 let prefix_matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
104 assert!(!prefix_matches.is_empty());
105 assert!(
106 prefix_matches
107 .iter()
108 .any(|p| exact.iter().any(|e| e.0 == p.0)),
109 "prefix highlight should align to original start"
110 );
111 }
112
113 #[test]
114 fn chinese_full_pinyin_prefix_search() {
115 let mut index = InMemoryIndex::default();
116 index.add_doc(INDEX, DOC_CN, "你好世界", true);
117
118 let hits = index.search(INDEX, "nih");
119 assert_contains_doc(&hits, DOC_CN);
120
121 let exact = index.get_matches(INDEX, DOC_CN, "nihaoshijie");
122 assert!(!exact.is_empty());
123 let hit = index
124 .search_hits(INDEX, "nih")
125 .into_iter()
126 .find(|h| h.doc_id == DOC_CN)
127 .expect("expected hit for prefix query");
128 let prefix_matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
129 assert!(!prefix_matches.is_empty());
130 assert!(
131 prefix_matches
132 .iter()
133 .any(|p| exact.iter().any(|e| e.0 == p.0)),
134 "prefix highlight should align to original start"
135 );
136 }
137
138 #[test]
139 fn pinyin_fuzzy_search() {
140 let mut index = InMemoryIndex::default();
141 index.add_doc(INDEX, DOC_CN, "你好世界", true);
142
143 let hits = index.search_hits(INDEX, "nihap");
144 assert!(
145 hits.iter()
146 .any(|h| h.doc_id == DOC_CN && !h.matched_terms.is_empty()),
147 "expected matched pinyin term in fuzzy hits: {:?}",
148 hits.iter()
149 .map(|h| (&h.doc_id, &h.matched_terms))
150 .collect::<Vec<_>>()
151 );
152
153 let fuzzy_original = index.search_with_mode(INDEX, "nihap", SearchMode::Fuzzy);
154 assert!(
155 fuzzy_original.is_empty(),
156 "expected SearchMode::Fuzzy to only search original domain, got {:?}",
157 fuzzy_original
158 );
159 }
160
161 #[test]
162 fn english_fuzzy_search() {
163 let mut index = InMemoryIndex::default();
164 index.add_doc(INDEX, DOC_EN, "fuzzy search handles typos", true);
165
166 let hits = index.search_hits(INDEX, "fuzze");
167 assert!(hits.iter().any(|h| {
168 h.doc_id == DOC_EN
169 && h.matched_terms
170 .iter()
171 .any(|t| t.term == "fuzzy" && t.domain == TermDomain::Original)
172 }));
173 }
174
175 #[test]
176 fn english_query_splits_separators_and_lowercases() {
177 let mut index = InMemoryIndex::default();
178 index.add_doc(INDEX, DOC_EN, "MEMORY-INDEXER", true);
179
180 let hits = index.search_with_mode(INDEX, "memory-indexer", SearchMode::Exact);
181 assert_contains_doc(&hits, DOC_EN);
182 }
183
184 #[test]
185 fn fuzzy_search_allows_alphanumeric_terms() {
186 let mut index = InMemoryIndex::default();
187 index.add_doc(INDEX, DOC_EN, "version2 stable", true);
188
189 let hits = index.search_with_mode(INDEX, "versoin2", SearchMode::Fuzzy);
190 assert_contains_doc(&hits, DOC_EN);
191 }
192
193 #[test]
194 fn fuzzy_search_handles_separated_query_terms() {
195 let mut index = InMemoryIndex::default();
196 index.add_doc(INDEX, DOC_EN, "memory-indexer", true);
197
198 let hits = index.search_with_mode(INDEX, "memry-indexer", SearchMode::Fuzzy);
199 assert_contains_doc(&hits, DOC_EN);
200 }
201
202 #[test]
203 fn fuzzy_search_handles_short_terms() {
204 let mut index = InMemoryIndex::default();
205 index.add_doc(INDEX, DOC_EN, "go go", true);
206
207 let hits = index.search_with_mode(INDEX, "go", SearchMode::Fuzzy);
208 assert_contains_doc(&hits, DOC_EN);
209 }
210
211 #[test]
212 fn pinyin_highlight_uses_original_positions() {
213 let mut index = InMemoryIndex::default();
214 index.add_doc(INDEX, DOC_CN, "你好世界", true);
215
216 let direct = index.get_matches(INDEX, DOC_CN, "你好");
217 assert!(
218 !direct.is_empty(),
219 "expected direct chinese match to have positions"
220 );
221
222 let pinyin = index.get_matches(INDEX, DOC_CN, "nihao");
223 assert_eq!(pinyin, direct);
224 }
225
226 #[test]
227 fn highlight_prefers_original_for_mixed_scripts() {
228 let mut index = InMemoryIndex::default();
229 index.add_doc(INDEX, DOC_CN, "hello 世界", true);
230
231 let hits = index.search_hits(INDEX, "hello shi");
232 let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
233 panic!("expected hit for mixed script query");
234 };
235 let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
236 let content = index.get_doc(INDEX, DOC_CN).unwrap();
237 let slices: Vec<String> = matches
238 .iter()
239 .map(|(s, e)| utf16_slice(&content, *s, *e))
240 .collect();
241 assert!(
242 slices.iter().any(|s| s == "hello"),
243 "expected original spans for mixed script matches, got {:?}",
244 slices
245 );
246 if slices.iter().any(|s| s.chars().any(|c| !c.is_ascii())) {
247 assert!(
248 slices.iter().any(|s| s == "世界"),
249 "expected CJK spans for mixed script matches, got {:?}",
250 slices
251 );
252 }
253 }
254
255 #[test]
256 fn pinyin_prefix_highlight_uses_original_spans() {
257 let mut index = InMemoryIndex::default();
258 index.add_doc(INDEX, DOC_CN, "你好世界", true);
259
260 let hits = index.search_hits(INDEX, "nih");
261 let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
262 panic!("expected prefix pinyin hit");
263 };
264 let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
265 let direct = index.get_matches(INDEX, DOC_CN, "你好");
266 assert_eq!(
267 matches, direct,
268 "prefix highlight should map back to original spans"
269 );
270 }
271
272 #[test]
273 fn pinyin_highlight_handles_trailing_ascii() {
274 let mut index = InMemoryIndex::with_position_encoding(PositionEncoding::Utf16);
275 index.add_doc(
276 INDEX,
277 DOC_CN,
278 "美光将在全球内存供应短缺之际退出消费级内存业务",
279 true,
280 );
281
282 let hits = index.search_hits(INDEX, "neicun");
283 let hit = hits
284 .iter()
285 .find(|h| h.doc_id == DOC_CN)
286 .unwrap_or_else(|| panic!("expected hit for neicun, got {:?}", hits));
287 let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
288 assert!(
289 !matches.is_empty(),
290 "expected highlight spans for pinyin match, got none"
291 );
292 let content = index.get_doc(INDEX, DOC_CN).unwrap();
293 let slices: Vec<String> = matches
294 .iter()
295 .map(|(s, e)| utf16_slice(&content, *s, *e))
296 .collect();
297 assert!(
298 slices.iter().all(|s| s == "内存"),
299 "expected highlights to stay on original term, got {:?}",
300 slices
301 );
302 }
303
304 fn utf16_slice(content: &str, start: u32, end: u32) -> String {
305 let mut utf16_pos = 0u32;
306 let mut start_byte = 0usize;
307 let mut end_byte = content.len();
308 for (idx, ch) in content.char_indices() {
309 if utf16_pos == start {
310 start_byte = idx;
311 }
312 utf16_pos += ch.len_utf16() as u32;
313 if utf16_pos == end {
314 end_byte = idx + ch.len_utf8();
315 break;
316 }
317 }
318 content[start_byte..end_byte].to_string()
319 }
320
321 #[test]
322 fn exact_search_prefers_original_terms() {
323 let mut index = InMemoryIndex::default();
324 index.add_doc(INDEX, DOC_EN, "nihao greeting", true);
325 index.add_doc(INDEX, DOC_CN, "你好世界", true);
326
327 let exact_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Exact);
328 assert_contains_doc(&exact_hits, DOC_EN);
329 assert!(
330 exact_hits.iter().all(|(id, _)| id == DOC_EN),
331 "expected exact search to ignore pinyin matches, got {:?}",
332 exact_hits
333 );
334
335 let auto_hits = index.search(INDEX, "nihao");
336 assert_contains_doc(&auto_hits, DOC_EN);
337 assert!(
338 auto_hits.iter().all(|(id, _)| id != DOC_CN),
339 "auto search should stop at exact matches"
340 );
341
342 let pinyin_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Pinyin);
343 assert_contains_doc(&pinyin_hits, DOC_CN);
344 }
345
346 #[test]
347 fn japanese_ngram_search() {
348 let mut index = InMemoryIndex::default();
349 index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
350
351 let hits = index.search(INDEX, "こん");
352 assert_contains_doc(&hits, DOC_JP);
353
354 let matches = index.get_matches(INDEX, DOC_JP, "こん");
355 assert!(
356 !matches.is_empty(),
357 "expected offsets for japanese ngram matches"
358 );
359 }
360
361 #[test]
362 fn kanji_adjacent_to_kana_skips_pinyin() {
363 let mut index = InMemoryIndex::default();
364 index.add_doc(INDEX, DOC_JP, "東京へようこそ", true);
365
366 let hits = index.search_with_mode(INDEX, "dongjing", SearchMode::Pinyin);
367 assert!(
368 hits.is_empty(),
369 "kanji near kana should not derive pinyin, got {:?}",
370 hits
371 );
372 }
373
374 #[test]
375 fn exact_search_applies_minimum_should_match() {
376 let mut index = InMemoryIndex::default();
377 index.add_doc(INDEX, "doc-2-terms", "apple banana", true);
378 index.add_doc(INDEX, "doc-3-terms", "apple banana cherry", true);
379 index.add_doc(INDEX, "doc-1-term", "apple", true);
380
381 let hits = index.search_with_mode(INDEX, "apple banana cherry", SearchMode::Exact);
382
383 assert_contains_doc(&hits, "doc-2-terms");
384 assert_contains_doc(&hits, "doc-3-terms");
385 assert!(
386 !hits.iter().any(|(id, _)| id == "doc-1-term"),
387 "docs below minimum_should_match should be filtered out"
388 );
389
390 let score_two = hits
391 .iter()
392 .find(|(id, _)| id == "doc-2-terms")
393 .map(|(_, s)| *s)
394 .unwrap();
395 let score_three = hits
396 .iter()
397 .find(|(id, _)| id == "doc-3-terms")
398 .map(|(_, s)| *s)
399 .unwrap();
400 assert!(
401 score_three > score_two,
402 "more matched terms should score higher: {} vs {}",
403 score_three,
404 score_two
405 );
406 }
407
408 #[test]
409 fn pinyin_polyphonic_variants_for_short_tokens() {
410 let mut index = InMemoryIndex::default();
411 index.add_doc(INDEX, DOC_CN, "重庆火锅", true);
412
413 let hits_zhong = index.search_with_mode_hits(INDEX, "zhongqing", SearchMode::Pinyin);
414 assert!(
415 hits_zhong.iter().any(|h| h.doc_id == DOC_CN),
416 "expected zhongqing variant to hit"
417 );
418
419 let hits_chong = index.search_with_mode_hits(INDEX, "chongqing", SearchMode::Pinyin);
420 assert!(
421 hits_chong.iter().any(|h| h.doc_id == DOC_CN),
422 "expected chongqing variant to hit"
423 );
424
425 let matched_terms: Vec<MatchedTerm> = hits_zhong
426 .into_iter()
427 .find(|h| h.doc_id == DOC_CN)
428 .map(|h| h.matched_terms)
429 .unwrap_or_default();
430 assert!(
431 matched_terms
432 .iter()
433 .any(|t| t.term.contains("zhongqing") || t.term.contains("chongqing")),
434 "expected polyphonic pinyin variants in matched_terms, got {:?}",
435 matched_terms
436 );
437 }
438
439 #[test]
440 fn get_matches_for_terms_uses_matched_terms() {
441 let mut index = InMemoryIndex::default();
442 index.add_doc(INDEX, DOC_EN, "memoryIndexer", true);
443
444 let hits = index.search_hits(INDEX, "memryindexer");
445 let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_EN) else {
446 panic!("expected hit for doc");
447 };
448 assert!(
449 hit.matched_terms
450 .iter()
451 .any(|t| t.term == "memoryindexer" && t.domain == TermDomain::Original),
452 "expected matched term memoryIndexer, got {:?}",
453 hit.matched_terms
454 );
455
456 let matches = index.get_matches_for_matched_terms(INDEX, DOC_EN, &hit.matched_terms);
457 assert!(!matches.is_empty(), "expected matches from matched_terms");
458 }
459
460 #[test]
461 fn fullwidth_pinyin_query_hits() {
462 let mut index = InMemoryIndex::default();
463 index.add_doc(INDEX, DOC_CN, "你好世界", true);
464
465 let hits = index.search_hits(INDEX, "NIHAO");
467 assert!(
468 hits.iter().any(|h| h.doc_id == DOC_CN),
469 "expected full-width pinyin query to hit, got {:?}",
470 hits.iter()
471 .map(|h| (&h.doc_id, &h.matched_terms))
472 .collect::<Vec<_>>()
473 );
474 let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
475 h.matched_terms
476 .iter()
477 .find(|t| t.domain == TermDomain::PinyinFull)
478 });
479 assert!(
480 matched.is_some(),
481 "expected matched pinyin full term, got {:?}",
482 hits.iter()
483 .find(|h| h.doc_id == DOC_CN)
484 .map(|h| h.matched_terms.clone())
485 );
486 }
487
488 #[test]
489 fn short_pinyin_fuzzy_hits() {
490 let mut index = InMemoryIndex::default();
491 index.add_doc(INDEX, DOC_CN, "你好", true);
492
493 let hits = index.search_hits(INDEX, "niha");
495 assert!(
496 hits.iter().any(|h| h.doc_id == DOC_CN),
497 "expected fuzzy pinyin hit for short query, got {:?}",
498 hits.iter()
499 .map(|h| (&h.doc_id, &h.matched_terms))
500 .collect::<Vec<_>>()
501 );
502 let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
503 h.matched_terms
504 .iter()
505 .find(|t| matches!(t.domain, TermDomain::PinyinFull))
506 });
507 assert!(
508 matched.is_some(),
509 "expected matched pinyin term, got {:?}",
510 hits.iter()
511 .find(|h| h.doc_id == DOC_CN)
512 .map(|h| h.matched_terms.clone())
513 );
514 }
515
516 #[test]
517 fn non_ascii_auto_fuzzy_fallback() {
518 let mut index = InMemoryIndex::default();
519 index.add_doc(INDEX, DOC_CN, "北京大学", true);
520
521 let hits = index.search_hits(INDEX, "北景大学");
523 assert!(
524 hits.iter().any(|h| h.doc_id == DOC_CN),
525 "expected non-ascii fuzzy fallback to hit, got {:?}",
526 hits.iter()
527 .map(|h| (&h.doc_id, &h.matched_terms))
528 .collect::<Vec<_>>()
529 );
530 }
531
532 #[test]
533 fn mixed_script_query_hits_all_tokens() {
534 let mut index = InMemoryIndex::default();
535 index.add_doc(INDEX, DOC_CN, "hello 世界", true);
536
537 let hits = index.search_hits(INDEX, "hello 世界");
538 assert!(
539 hits.iter().any(|h| h.doc_id == DOC_CN),
540 "expected mixed-script query to hit doc, got {:?}",
541 hits.iter()
542 .map(|h| (&h.doc_id, &h.matched_terms))
543 .collect::<Vec<_>>()
544 );
545 let matched = hits
546 .iter()
547 .find(|h| h.doc_id == DOC_CN)
548 .map(|h| h.matched_terms.clone())
549 .unwrap_or_default();
550 assert!(
551 matched
552 .iter()
553 .any(|t| t.term == "hello" && t.domain == TermDomain::Original),
554 "expected matched original term hello, got {:?}",
555 matched
556 );
557 assert!(
558 matched.iter().any(|t| t.term == "世界"),
559 "expected matched CJK term 世界, got {:?}",
560 matched
561 );
562 }
563
564 #[test]
565 fn chinese_oov_fuzzy_recall() {
566 let mut index = InMemoryIndex::default();
567 index.add_doc(INDEX, DOC_CN, "明博", true);
568
569 let hits = index.search_hits(INDEX, "明搏");
571 assert!(
572 hits.iter().any(|h| h.doc_id == DOC_CN),
573 "expected OOV chinese fuzzy to hit, got {:?}",
574 hits.iter()
575 .map(|h| (&h.doc_id, &h.matched_terms))
576 .collect::<Vec<_>>()
577 );
578 }
579
580 #[test]
581 fn load_snapshot_restores_domains_and_lengths() {
582 let mut index = InMemoryIndex::default();
583 index.add_doc(INDEX, DOC_CN, "你好世界", true);
584
585 let snapshot = index
586 .get_snapshot_data(INDEX)
587 .expect("snapshot should exist");
588 let expected_total_len = snapshot.total_len;
589 let expected_domain_len = snapshot.domain_total_len.get(TermDomain::Original);
590
591 let mut restored = InMemoryIndex::default();
592 restored.load_snapshot(INDEX, snapshot);
593
594 let hits = restored.search_hits(INDEX, "nihap");
595 assert!(
596 hits.iter().any(|hit| hit.doc_id == DOC_CN),
597 "expected restored index to serve pinyin fuzzy hits"
598 );
599 let restored_state = restored
600 .indexes
601 .get(INDEX)
602 .expect("restored index state should exist");
603 assert_eq!(restored_state.total_len, expected_total_len);
604 assert_eq!(
605 restored_state.domain_total_len.get(TermDomain::Original),
606 expected_domain_len
607 );
608 }
609
610 #[test]
611 fn has_unpersisted_changes_tracks_dirty_and_deleted() {
612 let mut index = InMemoryIndex::default();
613 assert!(!index.has_unpersisted_changes(None));
614
615 index.add_doc(INDEX, DOC_EN, "pending doc", true);
616 assert!(index.has_unpersisted_changes(Some(INDEX)));
617 assert!(index.has_unpersisted_changes(None));
618
619 index.take_dirty_and_deleted();
620 assert!(!index.has_unpersisted_changes(Some(INDEX)));
621 assert!(!index.has_unpersisted_changes(None));
622
623 index.remove_doc(INDEX, DOC_EN);
624 assert!(index.has_unpersisted_changes(Some(INDEX)));
625 assert!(index.has_unpersisted_changes(None));
626 }
627
628 #[test]
629 fn load_snapshot_clears_pending_flags() {
630 let mut index = InMemoryIndex::default();
631 index.add_doc(INDEX, DOC_EN, "snapshot doc", true);
632
633 let snapshot = index
634 .get_snapshot_data(INDEX)
635 .expect("snapshot should exist");
636 assert!(index.has_unpersisted_changes(Some(INDEX)));
637
638 index.load_snapshot(INDEX, snapshot);
639 assert!(
640 !index.has_unpersisted_changes(Some(INDEX)),
641 "loading a snapshot should reset pending persistence markers"
642 );
643 }
644
645 #[test]
646 fn persist_if_dirty_skips_when_clean() {
647 let mut index = InMemoryIndex::default();
648 let mut called = false;
649
650 let persisted = index
651 .persist_if_dirty(INDEX, |_snapshot| -> Result<(), ()> {
652 called = true;
653 Ok(())
654 })
655 .unwrap();
656
657 assert!(!persisted, "clean index should skip persistence");
658 assert!(!called, "callback should not run when skipped");
659 }
660
661 #[test]
662 fn persist_if_dirty_persists_and_marks_clean_on_success() {
663 let mut index = InMemoryIndex::default();
664 index.add_doc(INDEX, DOC_EN, "persist me", true);
665
666 let mut called = false;
667 let persisted = index
668 .persist_if_dirty(INDEX, |snapshot| -> Result<(), ()> {
669 called = true;
670 assert_eq!(snapshot.docs.len(), 1, "snapshot should include doc");
671 Ok(())
672 })
673 .unwrap();
674
675 assert!(persisted, "dirty index should persist");
676 assert!(called, "callback should run on persistence");
677 assert!(
678 !index.has_unpersisted_changes(Some(INDEX)),
679 "successful persist should mark index clean"
680 );
681 }
682
683 #[test]
684 fn persist_if_dirty_keeps_pending_on_error() {
685 let mut index = InMemoryIndex::default();
686 index.add_doc(INDEX, DOC_EN, "persist error", true);
687
688 let err = index
689 .persist_if_dirty(INDEX, |_snapshot| -> Result<(), &'static str> {
690 Err("boom")
691 })
692 .unwrap_err();
693 assert_eq!(err, "boom");
694 assert!(
695 index.has_unpersisted_changes(Some(INDEX)),
696 "failed persist should leave index dirty"
697 );
698 }
699
700 #[test]
701 fn fuzzy_msm_filters_insufficient_matches() {
702 let mut index = InMemoryIndex::default();
703 index.add_doc(INDEX, "doc-long", "apple banana", true);
704 index.add_doc(INDEX, "doc-short", "apple", true);
705
706 let hits = index.search_with_mode_hits(INDEX, "applr banaan", SearchMode::Fuzzy);
707 assert!(
708 hits.iter().any(|h| h.doc_id == "doc-long"),
709 "expected fuzzy msm to keep doc with both terms, got {:?}",
710 hits
711 );
712 assert!(
713 hits.iter().all(|h| h.doc_id != "doc-short"),
714 "docs below min_should_match should be filtered out: {:?}",
715 hits
716 );
717 }
718
719 #[test]
720 fn short_cjk_fuzzy_recall_uses_2gram() {
721 let mut index = InMemoryIndex::default();
722 index.add_doc(INDEX, "doc-short-cjk", "方案", true);
723
724 let hits = index.search_hits(INDEX, "方桉");
725 assert!(
726 hits.iter().any(|h| h.doc_id == "doc-short-cjk"),
727 "expected 2-gram fuzzy recall for short CJK tokens, got {:?}",
728 hits
729 );
730 }
731
732 #[test]
733 fn dictionary_load_and_fallback() {
734 let dir = tempdir().unwrap();
735 let path = dir.path().join("dict.json");
736
737 let mut entries = HashSet::new();
738 entries.insert("こんにちは".to_string());
739 let config = DictionaryConfig {
740 japanese: Some(ScriptDictionary {
741 version: Some("v1".to_string()),
742 entries,
743 }),
744 hangul: None,
745 };
746
747 std::fs::write(&path, serde_json::to_vec(&config).unwrap()).unwrap();
748 let loaded: DictionaryConfig =
749 serde_json::from_slice(&std::fs::read(&path).unwrap()).expect("should deserialize");
750
751 let mut index = InMemoryIndex::with_dictionary_config(loaded.clone());
752 index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
753
754 let hits = index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
755 assert!(
756 hits.iter().any(|h| h.doc_id == DOC_JP),
757 "expected dictionary-backed search hit, got {:?}",
758 hits
759 );
760 let mut fallback_index = InMemoryIndex::default();
761 fallback_index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
762 let fallback_hits =
763 fallback_index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
764 assert!(
765 fallback_hits.iter().any(|h| h.doc_id == DOC_JP),
766 "expected fallback tokenization to still recall doc, got {:?}",
767 fallback_hits
768 );
769 }
770
771 #[test]
772 fn id_like_tokens_match_exact() {
773 let mut index = InMemoryIndex::default();
774 let doc_id = "doc-id";
775 let id_like = "IKPeA9Zu9eo_pXlKWVFcf";
776
777 index.add_doc(INDEX, doc_id, id_like, true);
778
779 let hits = index.search_with_mode_hits(INDEX, id_like, SearchMode::Exact);
780 assert!(
781 hits.iter().any(|h| h.doc_id == doc_id),
782 "expected exact search to hit id-like token, got {:?}",
783 hits
784 );
785 }
786}