1mod base;
2mod index;
3mod ngram;
4mod pipeline;
5mod search;
6mod tokenizer;
7mod types;
8
9pub use types::{
10 DocData, InMemoryIndex, PositionEncoding, SNAPSHOT_VERSION, SearchHit, SearchMode,
11 SnapshotData, TermDomain,
12};
13
14pub use tokenizer::dictionary::{
15 DictionaryConfig, DictionaryLanguage, DictionarySegmenter, ScriptDictionary,
16 train_dictionary_config,
17};
18
19#[cfg(test)]
20mod tests {
21 use super::types::{DomainLengths, MatchedTerm, TermFrequency};
22 use super::*;
23 use std::collections::{HashMap, HashSet};
24 use tempfile::tempdir;
25
    /// Index name shared by every test case in this module.
    const INDEX: &str = "test-index";
    // Per-language document ids used across the tests.
    const DOC_CN: &str = "doc-cn";
    const DOC_EN: &str = "doc-en";
    const DOC_JP: &str = "doc-jp";
30
31 fn assert_contains_doc(results: &[(String, f64)], doc_id: &str) {
32 assert!(
33 results.iter().any(|(id, _)| id == doc_id),
34 "expected results to contain doc {doc_id}, got {:?}",
35 results
36 );
37 }
38
39 fn domain_term_dict<'a>(
40 index: &'a InMemoryIndex,
41 domain: TermDomain,
42 ) -> Option<&'a std::collections::HashSet<String>> {
43 index
44 .domains
45 .get(INDEX)
46 .and_then(|domains| domains.get(&domain))
47 .map(|d| &d.term_dict)
48 }
49
50 fn domain_ngram_index<'a>(
51 index: &'a InMemoryIndex,
52 domain: TermDomain,
53 ) -> Option<&'a std::collections::HashMap<String, Vec<String>>> {
54 index
55 .domains
56 .get(INDEX)
57 .and_then(|domains| domains.get(&domain))
58 .map(|d| &d.ngram_index)
59 }
60
61 #[test]
62 fn chinese_full_pinyin_search() {
63 let mut index = InMemoryIndex::default();
64 index.add_doc(INDEX, DOC_CN, "你好世界", true);
65
66 let hits = index.search(INDEX, "nihao");
67 assert_contains_doc(&hits, DOC_CN);
68 }
69
70 #[test]
71 fn chinese_initials_search() {
72 let mut index = InMemoryIndex::default();
73 index.add_doc(INDEX, DOC_CN, "你好世界", true);
74
75 let hits = index.search(INDEX, "nh");
76 assert_contains_doc(&hits, DOC_CN);
77 }
78
79 #[test]
80 fn chinese_initials_prefix_search() {
81 let mut index = InMemoryIndex::default();
82 index.add_doc(INDEX, DOC_CN, "你好世界", true);
83
84 let hits = index.search(INDEX, "nhs");
85 assert_contains_doc(&hits, DOC_CN);
86
87 let exact = index.get_matches(INDEX, DOC_CN, "nhsj");
88 let prefix = index.get_matches(INDEX, DOC_CN, "nhs");
89 assert!(!exact.is_empty());
90 assert!(!prefix.is_empty());
91 assert!(
92 prefix.iter().any(|p| exact.iter().any(|e| e.0 == p.0)),
93 "prefix highlight should align to original start"
94 );
95 }
96
97 #[test]
98 fn chinese_full_pinyin_prefix_search() {
99 let mut index = InMemoryIndex::default();
100 index.add_doc(INDEX, DOC_CN, "你好世界", true);
101
102 let hits = index.search(INDEX, "nih");
103 assert_contains_doc(&hits, DOC_CN);
104
105 let exact = index.get_matches(INDEX, DOC_CN, "nihaoshijie");
106 let prefix = index.get_matches(INDEX, DOC_CN, "nih");
107 assert!(!exact.is_empty());
108 assert!(!prefix.is_empty());
109 assert!(
110 prefix.iter().any(|p| exact.iter().any(|e| e.0 == p.0)),
111 "prefix highlight should align to original start"
112 );
113 }
114
    #[test]
    fn pinyin_fuzzy_search() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        // "nihap" is a typo of "nihao"; the default search should fall back
        // to fuzzy matching over the pinyin domains and still hit the doc.
        let hits = index.search_hits(INDEX, "nihap");
        assert!(
            hits.iter()
                .any(|h| h.doc_id == DOC_CN && !h.matched_terms.is_empty()),
            "expected matched pinyin term in fuzzy hits: {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );

        // Explicit SearchMode::Fuzzy is scoped to the original-term domain
        // only, so the same typo must NOT match via pinyin there.
        let fuzzy_original = index.search_with_mode(INDEX, "nihap", SearchMode::Fuzzy);
        assert!(
            fuzzy_original.is_empty(),
            "expected SearchMode::Fuzzy to only search original domain, got {:?}",
            fuzzy_original
        );
    }
137
138 #[test]
139 fn original_aux_index_excludes_non_ascii_terms() {
140 let mut index = InMemoryIndex::default();
141 index.add_doc(INDEX, DOC_CN, "你好世界", true);
142
143 if let Some(term_dict) = domain_term_dict(&index, TermDomain::Original) {
144 assert!(term_dict.contains("你好"));
145 assert!(term_dict.contains("世界"));
146 }
147 }
148
149 #[test]
150 fn english_fuzzy_search() {
151 let mut index = InMemoryIndex::default();
152 index.add_doc(INDEX, DOC_EN, "fuzzy search handles typos", true);
153
154 let hits = index.search_hits(INDEX, "fuzze");
155 assert!(hits.iter().any(|h| {
156 h.doc_id == DOC_EN
157 && h.matched_terms
158 .iter()
159 .any(|t| t.term == "fuzzy" && t.domain == TermDomain::Original)
160 }));
161 }
162
163 #[test]
164 fn english_query_splits_separators_and_lowercases() {
165 let mut index = InMemoryIndex::default();
166 index.add_doc(INDEX, DOC_EN, "MEMORY-INDEXER", true);
167
168 let hits = index.search_with_mode(INDEX, "memory-indexer", SearchMode::Exact);
169 assert_contains_doc(&hits, DOC_EN);
170 }
171
172 #[test]
173 fn fuzzy_search_allows_alphanumeric_terms() {
174 let mut index = InMemoryIndex::default();
175 index.add_doc(INDEX, DOC_EN, "version2 stable", true);
176
177 let hits = index.search_with_mode(INDEX, "versoin2", SearchMode::Fuzzy);
178 assert_contains_doc(&hits, DOC_EN);
179 }
180
181 #[test]
182 fn fuzzy_search_handles_separated_query_terms() {
183 let mut index = InMemoryIndex::default();
184 index.add_doc(INDEX, DOC_EN, "memory-indexer", true);
185
186 let hits = index.search_with_mode(INDEX, "memry-indexer", SearchMode::Fuzzy);
187 assert_contains_doc(&hits, DOC_EN);
188 }
189
190 #[test]
191 fn fuzzy_search_handles_short_terms() {
192 let mut index = InMemoryIndex::default();
193 index.add_doc(INDEX, DOC_EN, "go go", true);
194
195 let hits = index.search_with_mode(INDEX, "go", SearchMode::Fuzzy);
196 assert_contains_doc(&hits, DOC_EN);
197 }
198
199 #[test]
200 fn pinyin_highlight_uses_original_positions() {
201 let mut index = InMemoryIndex::default();
202 index.add_doc(INDEX, DOC_CN, "你好世界", true);
203
204 let direct = index.get_matches(INDEX, DOC_CN, "你好");
205 assert!(
206 !direct.is_empty(),
207 "expected direct chinese match to have positions"
208 );
209
210 let pinyin = index.get_matches(INDEX, DOC_CN, "nihao");
211 assert_eq!(pinyin, direct);
212 }
213
    #[test]
    fn highlight_prefers_original_for_mixed_scripts() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "hello 世界", true);

        // Mixed-script query: an ASCII token plus a pinyin prefix ("shi").
        let hits = index.search_hits(INDEX, "hello shi");
        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
            panic!("expected hit for mixed script query");
        };
        // Resolve matched terms to document spans, then slice the content at
        // those offsets (interpreted as UTF-16 code units by utf16_slice).
        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
        let content = index.get_doc(INDEX, DOC_CN).unwrap();
        let slices: Vec<String> = matches
            .iter()
            .map(|(s, e)| utf16_slice(&content, *s, *e))
            .collect();
        // The ASCII token must highlight its original span verbatim.
        assert!(
            slices.iter().any(|s| s == "hello"),
            "expected original spans for mixed script matches, got {:?}",
            slices
        );
        // If any span covers non-ASCII text, it must be the full CJK term,
        // not a partial or shifted slice.
        if slices.iter().any(|s| s.chars().any(|c| !c.is_ascii())) {
            assert!(
                slices.iter().any(|s| s == "世界"),
                "expected CJK spans for mixed script matches, got {:?}",
                slices
            );
        }
    }
242
243 #[test]
244 fn pinyin_prefix_highlight_uses_original_spans() {
245 let mut index = InMemoryIndex::default();
246 index.add_doc(INDEX, DOC_CN, "你好世界", true);
247
248 let hits = index.search_hits(INDEX, "nih");
249 let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
250 panic!("expected prefix pinyin hit");
251 };
252 let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
253 let direct = index.get_matches(INDEX, DOC_CN, "你好");
254 assert_eq!(
255 matches, direct,
256 "prefix highlight should map back to original spans"
257 );
258 }
259
    #[test]
    fn pinyin_highlight_handles_trailing_ascii() {
        // Explicit UTF-16 position encoding so spans line up with the
        // utf16_slice helper below.
        let mut index = InMemoryIndex::with_position_encoding(PositionEncoding::Utf16);
        index.add_doc(
            INDEX,
            DOC_CN,
            "美光将在全球内存供应短缺之际退出消费级内存业务",
            true,
        );

        // "neicun" is the pinyin of 内存, which occurs twice in the doc.
        let hits = index.search_hits(INDEX, "neicun");
        let hit = hits
            .iter()
            .find(|h| h.doc_id == DOC_CN)
            .unwrap_or_else(|| panic!("expected hit for neicun, got {:?}", hits));
        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
        assert!(
            !matches.is_empty(),
            "expected highlight spans for pinyin match, got none"
        );
        // Every highlighted span must cover exactly 内存 — no bleed into the
        // neighbouring characters.
        let content = index.get_doc(INDEX, DOC_CN).unwrap();
        let slices: Vec<String> = matches
            .iter()
            .map(|(s, e)| utf16_slice(&content, *s, *e))
            .collect();
        assert!(
            slices.iter().all(|s| s == "内存"),
            "expected highlights to stay on original term, got {:?}",
            slices
        );
    }
291
292 fn utf16_slice(content: &str, start: u32, end: u32) -> String {
293 let mut utf16_pos = 0u32;
294 let mut start_byte = 0usize;
295 let mut end_byte = content.len();
296 for (idx, ch) in content.char_indices() {
297 if utf16_pos == start {
298 start_byte = idx;
299 }
300 utf16_pos += ch.len_utf16() as u32;
301 if utf16_pos == end {
302 end_byte = idx + ch.len_utf8();
303 break;
304 }
305 }
306 content[start_byte..end_byte].to_string()
307 }
308
    #[test]
    fn exact_search_prefers_original_terms() {
        let mut index = InMemoryIndex::default();
        // "nihao" exists both as a literal English term and as the pinyin of
        // 你好, so the two docs disambiguate the search modes.
        index.add_doc(INDEX, DOC_EN, "nihao greeting", true);
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        // Exact mode only consults the original-term domain.
        let exact_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Exact);
        assert_contains_doc(&exact_hits, DOC_EN);
        assert!(
            exact_hits.iter().all(|(id, _)| id == DOC_EN),
            "expected exact search to ignore pinyin matches, got {:?}",
            exact_hits
        );

        // Auto mode stops once exact matches exist — it must not widen the
        // result set to the pinyin domains.
        let auto_hits = index.search(INDEX, "nihao");
        assert_contains_doc(&auto_hits, DOC_EN);
        assert!(
            auto_hits.iter().all(|(id, _)| id != DOC_CN),
            "auto search should stop at exact matches"
        );

        // Explicit pinyin mode still reaches the Chinese doc.
        let pinyin_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Pinyin);
        assert_contains_doc(&pinyin_hits, DOC_CN);
    }
333
334 #[test]
335 fn japanese_ngram_search() {
336 let mut index = InMemoryIndex::default();
337 index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
338
339 let hits = index.search(INDEX, "こん");
340 assert_contains_doc(&hits, DOC_JP);
341
342 let matches = index.get_matches(INDEX, DOC_JP, "こん");
343 assert!(
344 !matches.is_empty(),
345 "expected offsets for japanese ngram matches"
346 );
347 }
348
349 #[test]
350 fn kanji_adjacent_to_kana_skips_pinyin() {
351 let mut index = InMemoryIndex::default();
352 index.add_doc(INDEX, DOC_JP, "東京へようこそ", true);
353
354 let hits = index.search_with_mode(INDEX, "dongjing", SearchMode::Pinyin);
355 assert!(
356 hits.is_empty(),
357 "kanji near kana should not derive pinyin, got {:?}",
358 hits
359 );
360 }
361
    #[test]
    fn exact_search_applies_minimum_should_match() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, "doc-2-terms", "apple banana", true);
        index.add_doc(INDEX, "doc-3-terms", "apple banana cherry", true);
        index.add_doc(INDEX, "doc-1-term", "apple", true);

        // Three-term query: a doc matching only one term falls below the
        // minimum-should-match threshold and is dropped.
        let hits = index.search_with_mode(INDEX, "apple banana cherry", SearchMode::Exact);

        assert_contains_doc(&hits, "doc-2-terms");
        assert_contains_doc(&hits, "doc-3-terms");
        assert!(
            !hits.iter().any(|(id, _)| id == "doc-1-term"),
            "docs below minimum_should_match should be filtered out"
        );

        // Ranking sanity: matching all three terms outscores matching two.
        let score_two = hits
            .iter()
            .find(|(id, _)| id == "doc-2-terms")
            .map(|(_, s)| *s)
            .unwrap();
        let score_three = hits
            .iter()
            .find(|(id, _)| id == "doc-3-terms")
            .map(|(_, s)| *s)
            .unwrap();
        assert!(
            score_three > score_two,
            "more matched terms should score higher: {} vs {}",
            score_three,
            score_two
        );
    }
395
    #[test]
    fn pinyin_polyphonic_variants_for_short_tokens() {
        let mut index = InMemoryIndex::default();
        // 重 is polyphonic: read "zhong" or "chong" (as in 重庆 / Chongqing).
        index.add_doc(INDEX, DOC_CN, "重庆火锅", true);

        // Both readings must be indexed and searchable.
        let hits_zhong = index.search_with_mode_hits(INDEX, "zhongqing", SearchMode::Pinyin);
        assert!(
            hits_zhong.iter().any(|h| h.doc_id == DOC_CN),
            "expected zhongqing variant to hit"
        );

        let hits_chong = index.search_with_mode_hits(INDEX, "chongqing", SearchMode::Pinyin);
        assert!(
            hits_chong.iter().any(|h| h.doc_id == DOC_CN),
            "expected chongqing variant to hit"
        );

        // The hit's matched terms should expose at least one variant form.
        let matched_terms: Vec<MatchedTerm> = hits_zhong
            .into_iter()
            .find(|h| h.doc_id == DOC_CN)
            .map(|h| h.matched_terms)
            .unwrap_or_default();
        assert!(
            matched_terms
                .iter()
                .any(|t| t.term.contains("zhongqing") || t.term.contains("chongqing")),
            "expected polyphonic pinyin variants in matched_terms, got {:?}",
            matched_terms
        );
    }
426
    #[test]
    fn removing_doc_cleans_aux_indices() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "token removal check", true);

        index.remove_doc(INDEX, DOC_EN);

        // The whole domain entry may legitimately be dropped once its last
        // doc is removed, so only assert cleanliness when it still exists.
        if let Some(term_dict) = domain_term_dict(&index, TermDomain::Original) {
            assert!(
                !term_dict.contains("token"),
                "term_dict should drop removed terms"
            );
        }

        // Likewise, any surviving ngram entries must not reference the
        // removed doc's terms.
        if let Some(ngram_index) = domain_ngram_index(&index, TermDomain::Original) {
            let still_contains = ngram_index
                .values()
                .any(|terms| terms.iter().any(|term| term == "token"));
            assert!(!still_contains, "ngrams should remove term entries");
        }
    }
448
    #[test]
    fn get_matches_for_terms_uses_matched_terms() {
        let mut index = InMemoryIndex::default();
        // camelCase content is lowercased to the single term "memoryindexer".
        index.add_doc(INDEX, DOC_EN, "memoryIndexer", true);

        // Fuzzy query (missing the "o") should still resolve to that term.
        let hits = index.search_hits(INDEX, "memryindexer");
        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_EN) else {
            panic!("expected hit for doc");
        };
        assert!(
            hit.matched_terms
                .iter()
                .any(|t| t.term == "memoryindexer" && t.domain == TermDomain::Original),
            "expected matched term memoryIndexer, got {:?}",
            hit.matched_terms
        );

        // The matched terms alone must be enough to recover highlight spans.
        let matches = index.get_matches_for_matched_terms(INDEX, DOC_EN, &hit.matched_terms);
        assert!(!matches.is_empty(), "expected matches from matched_terms");
    }
469
    #[test]
    fn snapshot_contains_aux_indices_per_domain() {
        let mut index = InMemoryIndex::default();
        // Mixed content populates the original domain and both pinyin
        // domains in one add.
        index.add_doc(INDEX, DOC_CN, "你好世界 memory-indexer", true);

        let snapshot = index
            .get_snapshot_data(INDEX)
            .expect("snapshot should exist");

        // Each domain's aux structures (term_dict / ngram_index) must be
        // carried in the snapshot itself, not left as rebuild-only state.
        let domains = snapshot.domains;
        let original = domains
            .get(&TermDomain::Original)
            .expect("snapshot should contain original domain");
        assert!(
            !original.term_dict.is_empty(),
            "expected original aux index to be persisted"
        );
        let pinyin_full = domains
            .get(&TermDomain::PinyinFull)
            .expect("snapshot should contain pinyin full domain");
        assert!(
            !pinyin_full.term_dict.is_empty(),
            "expected full pinyin aux index to be persisted"
        );
        let pinyin_initials = domains
            .get(&TermDomain::PinyinInitials)
            .expect("snapshot should contain pinyin initials domain");
        assert!(
            !pinyin_initials.term_dict.is_empty(),
            "expected initials pinyin aux index to be persisted"
        );
        assert!(
            !pinyin_full.ngram_index.is_empty(),
            "expected pinyin ngram index to be persisted"
        );
    }
506
    #[test]
    fn fullwidth_pinyin_query_hits() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        // NOTE(review): the test name says "fullwidth" but the query is
        // plain upper-case ASCII; confirm whether a full-width query
        // (e.g. "ＮＩＨＡＯ") was intended, or rename the test.
        let hits = index.search_hits(INDEX, "NIHAO");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected full-width pinyin query to hit, got {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );
        // The match must come from the full-pinyin domain specifically.
        let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
            h.matched_terms
                .iter()
                .find(|t| t.domain == TermDomain::PinyinFull)
        });
        assert!(
            matched.is_some(),
            "expected matched pinyin full term, got {:?}",
            hits.iter()
                .find(|h| h.doc_id == DOC_CN)
                .map(|h| h.matched_terms.clone())
        );
    }
534
    #[test]
    fn short_pinyin_fuzzy_hits() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好", true);

        // "niha" is a near-miss of "nihao"; fuzzy/prefix pinyin matching
        // should still recall the doc.
        let hits = index.search_hits(INDEX, "niha");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected fuzzy pinyin hit for short query, got {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );
        // Either the full-pinyin or the full-pinyin-prefix domain may
        // produce the match; both are acceptable.
        let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
            h.matched_terms.iter().find(|t| {
                matches!(
                    t.domain,
                    TermDomain::PinyinFull | TermDomain::PinyinFullPrefix
                )
            })
        });
        assert!(
            matched.is_some(),
            "expected matched pinyin term, got {:?}",
            hits.iter()
                .find(|h| h.doc_id == DOC_CN)
                .map(|h| h.matched_terms.clone())
        );
    }
565
566 #[test]
567 fn non_ascii_auto_fuzzy_fallback() {
568 let mut index = InMemoryIndex::default();
569 index.add_doc(INDEX, DOC_CN, "北京大学", true);
570
571 let hits = index.search_hits(INDEX, "北景大学");
573 assert!(
574 hits.iter().any(|h| h.doc_id == DOC_CN),
575 "expected non-ascii fuzzy fallback to hit, got {:?}",
576 hits.iter()
577 .map(|h| (&h.doc_id, &h.matched_terms))
578 .collect::<Vec<_>>()
579 );
580 }
581
    #[test]
    fn mixed_script_query_hits_all_tokens() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "hello 世界", true);

        // The query mixes an ASCII token with a CJK token; both must match.
        let hits = index.search_hits(INDEX, "hello 世界");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected mixed-script query to hit doc, got {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );
        let matched = hits
            .iter()
            .find(|h| h.doc_id == DOC_CN)
            .map(|h| h.matched_terms.clone())
            .unwrap_or_default();
        // The ASCII token matches in the original domain …
        assert!(
            matched
                .iter()
                .any(|t| t.term == "hello" && t.domain == TermDomain::Original),
            "expected matched original term hello, got {:?}",
            matched
        );
        // … and the CJK token must be present among the matched terms too
        // (domain unspecified on purpose).
        assert!(
            matched.iter().any(|t| t.term == "世界"),
            "expected matched CJK term 世界, got {:?}",
            matched
        );
    }
613
614 #[test]
615 fn chinese_oov_fuzzy_recall() {
616 let mut index = InMemoryIndex::default();
617 index.add_doc(INDEX, DOC_CN, "明博", true);
618
619 let hits = index.search_hits(INDEX, "明搏");
621 assert!(
622 hits.iter().any(|h| h.doc_id == DOC_CN),
623 "expected OOV chinese fuzzy to hit, got {:?}",
624 hits.iter()
625 .map(|h| (&h.doc_id, &h.matched_terms))
626 .collect::<Vec<_>>()
627 );
628 }
629
    #[test]
    fn load_snapshot_rebuilds_missing_aux_indices() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        // Simulate an older/partial snapshot by wiping the pinyin aux
        // structures before restoring.
        let mut snapshot = index
            .get_snapshot_data(INDEX)
            .expect("snapshot should exist");
        if let Some(full) = snapshot.domains.get_mut(&TermDomain::PinyinFull) {
            full.term_dict.clear();
            full.ngram_index.clear();
        }
        if let Some(initials) = snapshot.domains.get_mut(&TermDomain::PinyinInitials) {
            initials.term_dict.clear();
            initials.ngram_index.clear();
        }

        let mut restored = InMemoryIndex::default();
        restored.load_snapshot(INDEX, snapshot);

        // load_snapshot should rebuild the wiped aux indices from the doc
        // data, restoring fuzzy pinyin search on the restored index.
        let hits = restored.search_hits(INDEX, "nihap");
        assert!(
            hits.iter().any(|hit| hit.doc_id == DOC_CN),
            "expected rebuilt pinyin aux indices to allow fuzzy hits"
        );
        assert!(
            restored
                .domains
                .get(INDEX)
                .and_then(|domains| domains.get(&TermDomain::PinyinFull))
                .is_some_and(|d| !d.term_dict.is_empty()),
            "expected pinyin full dictionary to be rebuilt from doc data"
        );
        assert!(
            restored
                .domains
                .get(INDEX)
                .and_then(|domains| domains.get(&TermDomain::PinyinInitials))
                .is_some_and(|d| !d.term_dict.is_empty()),
            "expected pinyin initials dictionary to be rebuilt from doc data"
        );
    }
672
673 #[test]
674 fn fuzzy_msm_filters_insufficient_matches() {
675 let mut index = InMemoryIndex::default();
676 index.add_doc(INDEX, "doc-long", "apple banana", true);
677 index.add_doc(INDEX, "doc-short", "apple", true);
678
679 let hits = index.search_with_mode_hits(INDEX, "applr banaan", SearchMode::Fuzzy);
680 assert!(
681 hits.iter().any(|h| h.doc_id == "doc-long"),
682 "expected fuzzy msm to keep doc with both terms, got {:?}",
683 hits
684 );
685 assert!(
686 hits.iter().all(|h| h.doc_id != "doc-short"),
687 "docs below min_should_match should be filtered out: {:?}",
688 hits
689 );
690 }
691
692 #[test]
693 fn short_cjk_fuzzy_recall_uses_2gram() {
694 let mut index = InMemoryIndex::default();
695 index.add_doc(INDEX, "doc-short-cjk", "方案", true);
696
697 let hits = index.search_hits(INDEX, "方桉");
698 assert!(
699 hits.iter().any(|h| h.doc_id == "doc-short-cjk"),
700 "expected 2-gram fuzzy recall for short CJK tokens, got {:?}",
701 hits
702 );
703 }
704
    #[test]
    fn snapshot_v2_rebuilds_derived_spans() {
        // Hand-build a version-2 (legacy) snapshot: term positions are byte
        // spans (你好 occupies bytes 0..6 in UTF-8) and the newer
        // `derived_terms` / `domains` data is absent.
        let mut term_pos: HashMap<String, Vec<(u32, u32)>> = HashMap::new();
        term_pos.insert("你好".to_string(), vec![(0, 6)]);
        term_pos.insert("nihao".to_string(), vec![(0, 6)]);

        // One original-domain term and one pinyin-domain term, each counted
        // once in its respective domain.
        let mut term_freqs: HashMap<String, TermFrequency> = HashMap::new();
        let mut freq_original = TermFrequency::default();
        freq_original.increment(TermDomain::Original);
        term_freqs.insert("你好".to_string(), freq_original);
        let mut freq_pinyin = TermFrequency::default();
        freq_pinyin.increment(TermDomain::PinyinFull);
        term_freqs.insert("nihao".to_string(), freq_pinyin);

        let mut docs = HashMap::new();
        docs.insert(
            DOC_CN.to_string(),
            DocData {
                content: "你好".to_string(),
                doc_len: 2,
                term_pos,
                term_freqs,
                domain_doc_len: DomainLengths::default(),
                derived_terms: HashMap::new(),
            },
        );

        // `domains` is empty — the loader must rebuild everything derived.
        let snapshot = SnapshotData {
            version: 2,
            docs,
            domains: HashMap::new(),
        };

        let mut index = InMemoryIndex::default();
        index.load_snapshot(INDEX, snapshot);

        // Loading must rebuild the pinyin search path from the legacy data …
        let hits = index.search_hits(INDEX, "nihao");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected legacy snapshot to rebuild pinyin hits, got {:?}",
            hits
        );

        // … and convert the byte span (0, 6) into the UTF-16 span (0, 2).
        let matches = index.get_matches(INDEX, DOC_CN, "nihao");
        assert!(
            matches.iter().any(|(s, e)| (*s, *e) == (0, 2)),
            "expected derived spans converted to utf16, got {:?}",
            matches
        );
    }
755
    #[test]
    fn dictionary_load_and_fallback() {
        // Round-trip a dictionary config through JSON on disk to exercise
        // the serde derives alongside the segmenter behaviour.
        let dir = tempdir().unwrap();
        let path = dir.path().join("dict.json");

        let mut entries = HashSet::new();
        entries.insert("こんにちは".to_string());
        let config = DictionaryConfig {
            japanese: Some(ScriptDictionary {
                version: Some("v1".to_string()),
                entries,
            }),
            hangul: None,
        };

        std::fs::write(&path, serde_json::to_vec(&config).unwrap()).unwrap();
        let loaded: DictionaryConfig =
            serde_json::from_slice(&std::fs::read(&path).unwrap()).expect("should deserialize");

        // With the dictionary, こんにちは should be searchable in exact mode.
        let mut index = InMemoryIndex::with_dictionary_config(loaded.clone());
        index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);

        let hits = index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_JP),
            "expected dictionary-backed search hit, got {:?}",
            hits
        );
        // If the original-domain aux index exists, the dictionary token
        // should appear in it as a single term.
        if let Some(dict) = domain_term_dict(&index, TermDomain::Original) {
            assert!(
                dict.contains("こんにちは"),
                "expected dictionary tokens to be indexed, got {:?}",
                dict
            );
        }

        // Without a dictionary, fallback tokenization must still recall the
        // same doc for the same exact-mode query.
        let mut fallback_index = InMemoryIndex::default();
        fallback_index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
        let fallback_hits =
            fallback_index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
        assert!(
            fallback_hits.iter().any(|h| h.doc_id == DOC_JP),
            "expected fallback tokenization to still recall doc, got {:?}",
            fallback_hits
        );
    }
802}