//! memory_indexer — crate root (`lib.rs`).

1mod base;
2mod index;
3mod ngram;
4mod pipeline;
5mod search;
6mod tokenizer;
7mod types;
8
9pub use types::{
10    DocData, InMemoryIndex, PositionEncoding, SNAPSHOT_VERSION, SearchHit, SearchMode,
11    SnapshotData, TermDomain,
12};
13
14pub use tokenizer::dictionary::{
15    DictionaryConfig, DictionaryLanguage, DictionarySegmenter, ScriptDictionary,
16    train_dictionary_config,
17};
18
// Unit tests for the in-memory index: CJK tokenization, pinyin search,
// fuzzy matching, highlight spans, and snapshot (de)serialization.
#[cfg(test)]
mod tests {
    use super::types::{DomainLengths, MatchedTerm, TermFrequency};
    use super::*;
    use std::collections::{HashMap, HashSet};
    use tempfile::tempdir;

    const INDEX: &str = "test-index";
    const DOC_CN: &str = "doc-cn";
    const DOC_EN: &str = "doc-en";
    const DOC_JP: &str = "doc-jp";

    /// Asserts that `results` contains a hit for `doc_id`.
    fn assert_contains_doc(results: &[(String, f64)], doc_id: &str) {
        assert!(
            results.iter().any(|(id, _)| id == doc_id),
            "expected results to contain doc {doc_id}, got {:?}",
            results
        );
    }

    /// Returns the term dictionary for `domain` under the test index name, if present.
    fn domain_term_dict<'a>(
        index: &'a InMemoryIndex,
        domain: TermDomain,
    ) -> Option<&'a std::collections::HashSet<String>> {
        index
            .domains
            .get(INDEX)
            .and_then(|domains| domains.get(&domain))
            .map(|d| &d.term_dict)
    }

    /// Returns the ngram index for `domain` under the test index name, if present.
    fn domain_ngram_index<'a>(
        index: &'a InMemoryIndex,
        domain: TermDomain,
    ) -> Option<&'a std::collections::HashMap<String, Vec<String>>> {
        index
            .domains
            .get(INDEX)
            .and_then(|domains| domains.get(&domain))
            .map(|d| &d.ngram_index)
    }

    #[test]
    fn chinese_full_pinyin_search() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let hits = index.search(INDEX, "nihao");
        assert_contains_doc(&hits, DOC_CN);
    }

    #[test]
    fn chinese_initials_search() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let hits = index.search(INDEX, "nh");
        assert_contains_doc(&hits, DOC_CN);
    }

    #[test]
    fn chinese_initials_prefix_search() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let hits = index.search(INDEX, "nhs");
        assert_contains_doc(&hits, DOC_CN);

        let exact = index.get_matches(INDEX, DOC_CN, "nhsj");
        let prefix = index.get_matches(INDEX, DOC_CN, "nhs");
        assert!(!exact.is_empty());
        assert!(!prefix.is_empty());
        assert!(
            prefix.iter().any(|p| exact.iter().any(|e| e.0 == p.0)),
            "prefix highlight should align to original start"
        );
    }

    #[test]
    fn chinese_full_pinyin_prefix_search() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let hits = index.search(INDEX, "nih");
        assert_contains_doc(&hits, DOC_CN);

        let exact = index.get_matches(INDEX, DOC_CN, "nihaoshijie");
        let prefix = index.get_matches(INDEX, DOC_CN, "nih");
        assert!(!exact.is_empty());
        assert!(!prefix.is_empty());
        assert!(
            prefix.iter().any(|p| exact.iter().any(|e| e.0 == p.0)),
            "prefix highlight should align to original start"
        );
    }

    #[test]
    fn pinyin_fuzzy_search() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let hits = index.search_hits(INDEX, "nihap");
        assert!(
            hits.iter()
                .any(|h| h.doc_id == DOC_CN && !h.matched_terms.is_empty()),
            "expected matched pinyin term in fuzzy hits: {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );

        let fuzzy_original = index.search_with_mode(INDEX, "nihap", SearchMode::Fuzzy);
        assert!(
            fuzzy_original.is_empty(),
            "expected SearchMode::Fuzzy to only search original domain, got {:?}",
            fuzzy_original
        );
    }

    #[test]
    fn original_aux_index_excludes_non_ascii_terms() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        if let Some(term_dict) = domain_term_dict(&index, TermDomain::Original) {
            assert!(term_dict.contains("你好"));
            assert!(term_dict.contains("世界"));
        }
    }

    #[test]
    fn english_fuzzy_search() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "fuzzy search handles typos", true);

        let hits = index.search_hits(INDEX, "fuzze");
        assert!(hits.iter().any(|h| {
            h.doc_id == DOC_EN
                && h.matched_terms
                    .iter()
                    .any(|t| t.term == "fuzzy" && t.domain == TermDomain::Original)
        }));
    }

    #[test]
    fn english_query_splits_separators_and_lowercases() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "MEMORY-INDEXER", true);

        let hits = index.search_with_mode(INDEX, "memory-indexer", SearchMode::Exact);
        assert_contains_doc(&hits, DOC_EN);
    }

    #[test]
    fn fuzzy_search_allows_alphanumeric_terms() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "version2 stable", true);

        let hits = index.search_with_mode(INDEX, "versoin2", SearchMode::Fuzzy);
        assert_contains_doc(&hits, DOC_EN);
    }

    #[test]
    fn fuzzy_search_handles_separated_query_terms() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "memory-indexer", true);

        let hits = index.search_with_mode(INDEX, "memry-indexer", SearchMode::Fuzzy);
        assert_contains_doc(&hits, DOC_EN);
    }

    #[test]
    fn fuzzy_search_handles_short_terms() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "go go", true);

        let hits = index.search_with_mode(INDEX, "go", SearchMode::Fuzzy);
        assert_contains_doc(&hits, DOC_EN);
    }

    #[test]
    fn pinyin_highlight_uses_original_positions() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let direct = index.get_matches(INDEX, DOC_CN, "你好");
        assert!(
            !direct.is_empty(),
            "expected direct chinese match to have positions"
        );

        let pinyin = index.get_matches(INDEX, DOC_CN, "nihao");
        assert_eq!(pinyin, direct);
    }

    #[test]
    fn highlight_prefers_original_for_mixed_scripts() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "hello 世界", true);

        let hits = index.search_hits(INDEX, "hello shi");
        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
            panic!("expected hit for mixed script query");
        };
        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
        let content = index.get_doc(INDEX, DOC_CN).unwrap();
        let slices: Vec<String> = matches
            .iter()
            .map(|(s, e)| utf16_slice(&content, *s, *e))
            .collect();
        assert!(
            slices.iter().any(|s| s == "hello"),
            "expected original spans for mixed script matches, got {:?}",
            slices
        );
        if slices.iter().any(|s| s.chars().any(|c| !c.is_ascii())) {
            assert!(
                slices.iter().any(|s| s == "世界"),
                "expected CJK spans for mixed script matches, got {:?}",
                slices
            );
        }
    }

    #[test]
    fn pinyin_prefix_highlight_uses_original_spans() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let hits = index.search_hits(INDEX, "nih");
        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
            panic!("expected prefix pinyin hit");
        };
        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
        let direct = index.get_matches(INDEX, DOC_CN, "你好");
        assert_eq!(
            matches, direct,
            "prefix highlight should map back to original spans"
        );
    }

    #[test]
    fn pinyin_highlight_handles_trailing_ascii() {
        let mut index = InMemoryIndex::with_position_encoding(PositionEncoding::Utf16);
        index.add_doc(
            INDEX,
            DOC_CN,
            "美光将在全球内存供应短缺之际退出消费级内存业务",
            true,
        );

        let hits = index.search_hits(INDEX, "neicun");
        let hit = hits
            .iter()
            .find(|h| h.doc_id == DOC_CN)
            .unwrap_or_else(|| panic!("expected hit for neicun, got {:?}", hits));
        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
        assert!(
            !matches.is_empty(),
            "expected highlight spans for pinyin match, got none"
        );
        let content = index.get_doc(INDEX, DOC_CN).unwrap();
        let slices: Vec<String> = matches
            .iter()
            .map(|(s, e)| utf16_slice(&content, *s, *e))
            .collect();
        assert!(
            slices.iter().all(|s| s == "内存"),
            "expected highlights to stay on original term, got {:?}",
            slices
        );
    }

    /// Extracts the substring of `content` addressed by UTF-16 code-unit
    /// offsets `[start, end)`, matching the index's highlight-span encoding.
    fn utf16_slice(content: &str, start: u32, end: u32) -> String {
        let mut utf16_pos = 0u32;
        let mut start_byte = 0usize;
        let mut end_byte = content.len();
        for (idx, ch) in content.char_indices() {
            if utf16_pos == start {
                start_byte = idx;
            }
            utf16_pos += ch.len_utf16() as u32;
            if utf16_pos == end {
                end_byte = idx + ch.len_utf8();
                break;
            }
        }
        content[start_byte..end_byte].to_string()
    }

    #[test]
    fn exact_search_prefers_original_terms() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "nihao greeting", true);
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let exact_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Exact);
        assert_contains_doc(&exact_hits, DOC_EN);
        assert!(
            exact_hits.iter().all(|(id, _)| id == DOC_EN),
            "expected exact search to ignore pinyin matches, got {:?}",
            exact_hits
        );

        let auto_hits = index.search(INDEX, "nihao");
        assert_contains_doc(&auto_hits, DOC_EN);
        assert!(
            auto_hits.iter().all(|(id, _)| id != DOC_CN),
            "auto search should stop at exact matches"
        );

        let pinyin_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Pinyin);
        assert_contains_doc(&pinyin_hits, DOC_CN);
    }

    #[test]
    fn japanese_ngram_search() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);

        let hits = index.search(INDEX, "こん");
        assert_contains_doc(&hits, DOC_JP);

        let matches = index.get_matches(INDEX, DOC_JP, "こん");
        assert!(
            !matches.is_empty(),
            "expected offsets for japanese ngram matches"
        );
    }

    #[test]
    fn kanji_adjacent_to_kana_skips_pinyin() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_JP, "東京へようこそ", true);

        let hits = index.search_with_mode(INDEX, "dongjing", SearchMode::Pinyin);
        assert!(
            hits.is_empty(),
            "kanji near kana should not derive pinyin, got {:?}",
            hits
        );
    }

    #[test]
    fn exact_search_applies_minimum_should_match() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, "doc-2-terms", "apple banana", true);
        index.add_doc(INDEX, "doc-3-terms", "apple banana cherry", true);
        index.add_doc(INDEX, "doc-1-term", "apple", true);

        let hits = index.search_with_mode(INDEX, "apple banana cherry", SearchMode::Exact);

        assert_contains_doc(&hits, "doc-2-terms");
        assert_contains_doc(&hits, "doc-3-terms");
        assert!(
            !hits.iter().any(|(id, _)| id == "doc-1-term"),
            "docs below minimum_should_match should be filtered out"
        );

        let score_two = hits
            .iter()
            .find(|(id, _)| id == "doc-2-terms")
            .map(|(_, s)| *s)
            .unwrap();
        let score_three = hits
            .iter()
            .find(|(id, _)| id == "doc-3-terms")
            .map(|(_, s)| *s)
            .unwrap();
        assert!(
            score_three > score_two,
            "more matched terms should score higher: {} vs {}",
            score_three,
            score_two
        );
    }

    #[test]
    fn pinyin_polyphonic_variants_for_short_tokens() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "重庆火锅", true);

        let hits_zhong = index.search_with_mode_hits(INDEX, "zhongqing", SearchMode::Pinyin);
        assert!(
            hits_zhong.iter().any(|h| h.doc_id == DOC_CN),
            "expected zhongqing variant to hit"
        );

        let hits_chong = index.search_with_mode_hits(INDEX, "chongqing", SearchMode::Pinyin);
        assert!(
            hits_chong.iter().any(|h| h.doc_id == DOC_CN),
            "expected chongqing variant to hit"
        );

        let matched_terms: Vec<MatchedTerm> = hits_zhong
            .into_iter()
            .find(|h| h.doc_id == DOC_CN)
            .map(|h| h.matched_terms)
            .unwrap_or_default();
        assert!(
            matched_terms
                .iter()
                .any(|t| t.term.contains("zhongqing") || t.term.contains("chongqing")),
            "expected polyphonic pinyin variants in matched_terms, got {:?}",
            matched_terms
        );
    }

    #[test]
    fn removing_doc_cleans_aux_indices() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "token removal check", true);

        index.remove_doc(INDEX, DOC_EN);

        if let Some(term_dict) = domain_term_dict(&index, TermDomain::Original) {
            assert!(
                !term_dict.contains("token"),
                "term_dict should drop removed terms"
            );
        }

        if let Some(ngram_index) = domain_ngram_index(&index, TermDomain::Original) {
            let still_contains = ngram_index
                .values()
                .any(|terms| terms.iter().any(|term| term == "token"));
            assert!(!still_contains, "ngrams should remove term entries");
        }
    }

    #[test]
    fn get_matches_for_terms_uses_matched_terms() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_EN, "memoryIndexer", true);

        let hits = index.search_hits(INDEX, "memryindexer");
        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_EN) else {
            panic!("expected hit for doc");
        };
        assert!(
            hit.matched_terms
                .iter()
                .any(|t| t.term == "memoryindexer" && t.domain == TermDomain::Original),
            "expected matched term memoryIndexer, got {:?}",
            hit.matched_terms
        );

        let matches = index.get_matches_for_matched_terms(INDEX, DOC_EN, &hit.matched_terms);
        assert!(!matches.is_empty(), "expected matches from matched_terms");
    }

    #[test]
    fn snapshot_contains_aux_indices_per_domain() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界 memory-indexer", true);

        let snapshot = index
            .get_snapshot_data(INDEX)
            .expect("snapshot should exist");

        let domains = snapshot.domains;
        let original = domains
            .get(&TermDomain::Original)
            .expect("snapshot should contain original domain");
        assert!(
            !original.term_dict.is_empty(),
            "expected original aux index to be persisted"
        );
        let pinyin_full = domains
            .get(&TermDomain::PinyinFull)
            .expect("snapshot should contain pinyin full domain");
        assert!(
            !pinyin_full.term_dict.is_empty(),
            "expected full pinyin aux index to be persisted"
        );
        let pinyin_initials = domains
            .get(&TermDomain::PinyinInitials)
            .expect("snapshot should contain pinyin initials domain");
        assert!(
            !pinyin_initials.term_dict.is_empty(),
            "expected initials pinyin aux index to be persisted"
        );
        assert!(
            !pinyin_full.ngram_index.is_empty(),
            "expected pinyin ngram index to be persisted"
        );
    }

    #[test]
    fn fullwidth_pinyin_query_hits() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        // Full-width ASCII should normalize to ASCII and derive pinyin.
        // NOTE(review): the query below is plain ASCII; if this test is meant to
        // exercise full-width input, the literal may have been "ＮＩＨＡＯ" — confirm.
        let hits = index.search_hits(INDEX, "NIHAO");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected full-width pinyin query to hit, got {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );
        let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
            h.matched_terms
                .iter()
                .find(|t| t.domain == TermDomain::PinyinFull)
        });
        assert!(
            matched.is_some(),
            "expected matched pinyin full term, got {:?}",
            hits.iter()
                .find(|h| h.doc_id == DOC_CN)
                .map(|h| h.matched_terms.clone())
        );
    }

    #[test]
    fn short_pinyin_fuzzy_hits() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好", true);

        // Missing one character should still fuzzy match via pinyin domain.
        let hits = index.search_hits(INDEX, "niha");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected fuzzy pinyin hit for short query, got {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );
        let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
            h.matched_terms.iter().find(|t| {
                matches!(
                    t.domain,
                    TermDomain::PinyinFull | TermDomain::PinyinFullPrefix
                )
            })
        });
        assert!(
            matched.is_some(),
            "expected matched pinyin term, got {:?}",
            hits.iter()
                .find(|h| h.doc_id == DOC_CN)
                .map(|h| h.matched_terms.clone())
        );
    }

    #[test]
    fn non_ascii_auto_fuzzy_fallback() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "北京大学", true);

        // Typo on the last character should still match via non-ASCII fuzzy fallback.
        let hits = index.search_hits(INDEX, "北景大学");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected non-ascii fuzzy fallback to hit, got {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn mixed_script_query_hits_all_tokens() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "hello 世界", true);

        let hits = index.search_hits(INDEX, "hello 世界");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected mixed-script query to hit doc, got {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );
        let matched = hits
            .iter()
            .find(|h| h.doc_id == DOC_CN)
            .map(|h| h.matched_terms.clone())
            .unwrap_or_default();
        assert!(
            matched
                .iter()
                .any(|t| t.term == "hello" && t.domain == TermDomain::Original),
            "expected matched original term hello, got {:?}",
            matched
        );
        assert!(
            matched.iter().any(|t| t.term == "世界"),
            "expected matched CJK term 世界, got {:?}",
            matched
        );
    }

    #[test]
    fn chinese_oov_fuzzy_recall() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "明博", true);

        // Typo on the second char should still recall via non-ASCII fuzzy fallback.
        let hits = index.search_hits(INDEX, "明搏");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected OOV chinese fuzzy to hit, got {:?}",
            hits.iter()
                .map(|h| (&h.doc_id, &h.matched_terms))
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn load_snapshot_rebuilds_missing_aux_indices() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, DOC_CN, "你好世界", true);

        let mut snapshot = index
            .get_snapshot_data(INDEX)
            .expect("snapshot should exist");
        // Simulate a snapshot persisted without aux indices.
        if let Some(full) = snapshot.domains.get_mut(&TermDomain::PinyinFull) {
            full.term_dict.clear();
            full.ngram_index.clear();
        }
        if let Some(initials) = snapshot.domains.get_mut(&TermDomain::PinyinInitials) {
            initials.term_dict.clear();
            initials.ngram_index.clear();
        }

        let mut restored = InMemoryIndex::default();
        restored.load_snapshot(INDEX, snapshot);

        let hits = restored.search_hits(INDEX, "nihap");
        assert!(
            hits.iter().any(|hit| hit.doc_id == DOC_CN),
            "expected rebuilt pinyin aux indices to allow fuzzy hits"
        );
        assert!(
            restored
                .domains
                .get(INDEX)
                .and_then(|domains| domains.get(&TermDomain::PinyinFull))
                .is_some_and(|d| !d.term_dict.is_empty()),
            "expected pinyin full dictionary to be rebuilt from doc data"
        );
        assert!(
            restored
                .domains
                .get(INDEX)
                .and_then(|domains| domains.get(&TermDomain::PinyinInitials))
                .is_some_and(|d| !d.term_dict.is_empty()),
            "expected pinyin initials dictionary to be rebuilt from doc data"
        );
    }

    #[test]
    fn fuzzy_msm_filters_insufficient_matches() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, "doc-long", "apple banana", true);
        index.add_doc(INDEX, "doc-short", "apple", true);

        let hits = index.search_with_mode_hits(INDEX, "applr banaan", SearchMode::Fuzzy);
        assert!(
            hits.iter().any(|h| h.doc_id == "doc-long"),
            "expected fuzzy msm to keep doc with both terms, got {:?}",
            hits
        );
        assert!(
            hits.iter().all(|h| h.doc_id != "doc-short"),
            "docs below min_should_match should be filtered out: {:?}",
            hits
        );
    }

    #[test]
    fn short_cjk_fuzzy_recall_uses_2gram() {
        let mut index = InMemoryIndex::default();
        index.add_doc(INDEX, "doc-short-cjk", "方案", true);

        let hits = index.search_hits(INDEX, "方桉");
        assert!(
            hits.iter().any(|h| h.doc_id == "doc-short-cjk"),
            "expected 2-gram fuzzy recall for short CJK tokens, got {:?}",
            hits
        );
    }

    #[test]
    fn snapshot_v2_rebuilds_derived_spans() {
        // Hand-build a legacy (version 2) snapshot with no domain aux data.
        let mut term_pos: HashMap<String, Vec<(u32, u32)>> = HashMap::new();
        term_pos.insert("你好".to_string(), vec![(0, 6)]);
        term_pos.insert("nihao".to_string(), vec![(0, 6)]);

        let mut term_freqs: HashMap<String, TermFrequency> = HashMap::new();
        let mut freq_original = TermFrequency::default();
        freq_original.increment(TermDomain::Original);
        term_freqs.insert("你好".to_string(), freq_original);
        let mut freq_pinyin = TermFrequency::default();
        freq_pinyin.increment(TermDomain::PinyinFull);
        term_freqs.insert("nihao".to_string(), freq_pinyin);

        let mut docs = HashMap::new();
        docs.insert(
            DOC_CN.to_string(),
            DocData {
                content: "你好".to_string(),
                doc_len: 2,
                term_pos,
                term_freqs,
                domain_doc_len: DomainLengths::default(),
                derived_terms: HashMap::new(),
            },
        );

        let snapshot = SnapshotData {
            version: 2,
            docs,
            domains: HashMap::new(),
        };

        let mut index = InMemoryIndex::default();
        index.load_snapshot(INDEX, snapshot);

        let hits = index.search_hits(INDEX, "nihao");
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_CN),
            "expected legacy snapshot to rebuild pinyin hits, got {:?}",
            hits
        );

        let matches = index.get_matches(INDEX, DOC_CN, "nihao");
        assert!(
            matches.iter().any(|(s, e)| (*s, *e) == (0, 2)),
            "expected derived spans converted to utf16, got {:?}",
            matches
        );
    }

    #[test]
    fn dictionary_load_and_fallback() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("dict.json");

        let mut entries = HashSet::new();
        entries.insert("こんにちは".to_string());
        let config = DictionaryConfig {
            japanese: Some(ScriptDictionary {
                version: Some("v1".to_string()),
                entries,
            }),
            hangul: None,
        };

        // Round-trip the config through disk to exercise (de)serialization.
        std::fs::write(&path, serde_json::to_vec(&config).unwrap()).unwrap();
        let loaded: DictionaryConfig =
            serde_json::from_slice(&std::fs::read(&path).unwrap()).expect("should deserialize");

        let mut index = InMemoryIndex::with_dictionary_config(loaded.clone());
        index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);

        let hits = index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
        assert!(
            hits.iter().any(|h| h.doc_id == DOC_JP),
            "expected dictionary-backed search hit, got {:?}",
            hits
        );
        if let Some(dict) = domain_term_dict(&index, TermDomain::Original) {
            assert!(
                dict.contains("こんにちは"),
                "expected dictionary tokens to be indexed, got {:?}",
                dict
            );
        }

        // Without a dictionary, fallback tokenization must still recall the doc.
        let mut fallback_index = InMemoryIndex::default();
        fallback_index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
        let fallback_hits =
            fallback_index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
        assert!(
            fallback_hits.iter().any(|h| h.doc_id == DOC_JP),
            "expected fallback tokenization to still recall doc, got {:?}",
            fallback_hits
        );
    }
}