memory_indexer/
lib.rs

1mod base;
2mod index;
3mod ngram;
4mod pipeline;
5mod search;
6mod tokenizer;
7mod types;
8
9pub use types::{
10    DocData, InMemoryIndex, PositionEncoding, SNAPSHOT_VERSION, SearchHit, SearchMode,
11    SnapshotData, TermDomain,
12};
13
14pub use tokenizer::dictionary::{
15    DictionaryConfig, DictionaryLanguage, DictionarySegmenter, ScriptDictionary,
16    train_dictionary_config,
17};
18
19#[cfg(test)]
20mod tests {
21    use super::types::MatchedTerm;
22    use super::*;
23    use std::collections::HashSet;
24    use tempfile::tempdir;
25
26    // #[test]
27    // #[ignore = "only use for local load/save testing"]
28    // fn load_index_snapshot() {
29    //     let ts = std::time::Instant::now();
30    //     if let Ok(decompressed) =
31    //         zstd::stream::decode_all(std::io::Cursor::new(&include_bytes!("../index.bin")))
32    //     {
33    //         println!(
34    //             "Decompression took {:?}, {}",
35    //             ts.elapsed(),
36    //             decompressed.len()
37    //         );
38    //         let ts = std::time::Instant::now();
39    //         let config = bincode::config::standard();
40    //         let snapshot: SnapshotData = bincode::serde::decode_from_slice(&decompressed, config)
41    //             .unwrap()
42    //             .0;
43    //         println!("Deserialization took {:?}", ts.elapsed());
44    //         println!(
45    //             "docs: {}, domains: {}, total_len: {}",
46    //             snapshot.docs.len(),
47    //             snapshot.domains.len(),
48    //             snapshot.total_len
49    //         );
50    //         let ts = std::time::Instant::now();
51    //         let mut index = InMemoryIndex::default();
52    //         index.load_snapshot("test-index", snapshot);
53    //         println!("Loading into index took {:?}", ts.elapsed());
54    //     }
55    // }
56
57    const INDEX: &str = "test-index";
58    const DOC_CN: &str = "doc-cn";
59    const DOC_EN: &str = "doc-en";
60    const DOC_JP: &str = "doc-jp";
61
62    fn assert_contains_doc(results: &[(String, f64)], doc_id: &str) {
63        assert!(
64            results.iter().any(|(id, _)| id == doc_id),
65            "expected results to contain doc {doc_id}, got {:?}",
66            results
67        );
68    }
69
70    #[test]
71    fn chinese_full_pinyin_search() {
72        let mut index = InMemoryIndex::default();
73        index.add_doc(INDEX, DOC_CN, "你好世界", true);
74
75        let hits = index.search(INDEX, "nihao");
76        assert_contains_doc(&hits, DOC_CN);
77    }
78
79    #[test]
80    fn chinese_initials_search() {
81        let mut index = InMemoryIndex::default();
82        index.add_doc(INDEX, DOC_CN, "你好世界", true);
83
84        let hits = index.search(INDEX, "nh");
85        assert_contains_doc(&hits, DOC_CN);
86    }
87
88    #[test]
89    fn chinese_initials_prefix_search() {
90        let mut index = InMemoryIndex::default();
91        index.add_doc(INDEX, DOC_CN, "你好世界", true);
92
93        let hits = index.search(INDEX, "nhs");
94        assert_contains_doc(&hits, DOC_CN);
95
96        let exact = index.get_matches(INDEX, DOC_CN, "nhsj");
97        assert!(!exact.is_empty());
98        let hit = index
99            .search_hits(INDEX, "nhs")
100            .into_iter()
101            .find(|h| h.doc_id == DOC_CN)
102            .expect("expected hit for prefix query");
103        let prefix_matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
104        assert!(!prefix_matches.is_empty());
105        assert!(
106            prefix_matches
107                .iter()
108                .any(|p| exact.iter().any(|e| e.0 == p.0)),
109            "prefix highlight should align to original start"
110        );
111    }
112
113    #[test]
114    fn chinese_full_pinyin_prefix_search() {
115        let mut index = InMemoryIndex::default();
116        index.add_doc(INDEX, DOC_CN, "你好世界", true);
117
118        let hits = index.search(INDEX, "nih");
119        assert_contains_doc(&hits, DOC_CN);
120
121        let exact = index.get_matches(INDEX, DOC_CN, "nihaoshijie");
122        assert!(!exact.is_empty());
123        let hit = index
124            .search_hits(INDEX, "nih")
125            .into_iter()
126            .find(|h| h.doc_id == DOC_CN)
127            .expect("expected hit for prefix query");
128        let prefix_matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
129        assert!(!prefix_matches.is_empty());
130        assert!(
131            prefix_matches
132                .iter()
133                .any(|p| exact.iter().any(|e| e.0 == p.0)),
134            "prefix highlight should align to original start"
135        );
136    }
137
138    #[test]
139    fn pinyin_fuzzy_search() {
140        let mut index = InMemoryIndex::default();
141        index.add_doc(INDEX, DOC_CN, "你好世界", true);
142
143        let hits = index.search_hits(INDEX, "nihap");
144        assert!(
145            hits.iter()
146                .any(|h| h.doc_id == DOC_CN && !h.matched_terms.is_empty()),
147            "expected matched pinyin term in fuzzy hits: {:?}",
148            hits.iter()
149                .map(|h| (&h.doc_id, &h.matched_terms))
150                .collect::<Vec<_>>()
151        );
152
153        let fuzzy_original = index.search_with_mode(INDEX, "nihap", SearchMode::Fuzzy);
154        assert!(
155            fuzzy_original.is_empty(),
156            "expected SearchMode::Fuzzy to only search original domain, got {:?}",
157            fuzzy_original
158        );
159    }
160
161    #[test]
162    fn english_fuzzy_search() {
163        let mut index = InMemoryIndex::default();
164        index.add_doc(INDEX, DOC_EN, "fuzzy search handles typos", true);
165
166        let hits = index.search_hits(INDEX, "fuzze");
167        assert!(hits.iter().any(|h| {
168            h.doc_id == DOC_EN
169                && h.matched_terms
170                    .iter()
171                    .any(|t| t.term == "fuzzy" && t.domain == TermDomain::Original)
172        }));
173    }
174
175    #[test]
176    fn english_query_splits_separators_and_lowercases() {
177        let mut index = InMemoryIndex::default();
178        index.add_doc(INDEX, DOC_EN, "MEMORY-INDEXER", true);
179
180        let hits = index.search_with_mode(INDEX, "memory-indexer", SearchMode::Exact);
181        assert_contains_doc(&hits, DOC_EN);
182    }
183
184    #[test]
185    fn fuzzy_search_allows_alphanumeric_terms() {
186        let mut index = InMemoryIndex::default();
187        index.add_doc(INDEX, DOC_EN, "version2 stable", true);
188
189        let hits = index.search_with_mode(INDEX, "versoin2", SearchMode::Fuzzy);
190        assert_contains_doc(&hits, DOC_EN);
191    }
192
193    #[test]
194    fn fuzzy_search_handles_separated_query_terms() {
195        let mut index = InMemoryIndex::default();
196        index.add_doc(INDEX, DOC_EN, "memory-indexer", true);
197
198        let hits = index.search_with_mode(INDEX, "memry-indexer", SearchMode::Fuzzy);
199        assert_contains_doc(&hits, DOC_EN);
200    }
201
202    #[test]
203    fn fuzzy_search_handles_short_terms() {
204        let mut index = InMemoryIndex::default();
205        index.add_doc(INDEX, DOC_EN, "go go", true);
206
207        let hits = index.search_with_mode(INDEX, "go", SearchMode::Fuzzy);
208        assert_contains_doc(&hits, DOC_EN);
209    }
210
211    #[test]
212    fn pinyin_highlight_uses_original_positions() {
213        let mut index = InMemoryIndex::default();
214        index.add_doc(INDEX, DOC_CN, "你好世界", true);
215
216        let direct = index.get_matches(INDEX, DOC_CN, "你好");
217        assert!(
218            !direct.is_empty(),
219            "expected direct chinese match to have positions"
220        );
221
222        let pinyin = index.get_matches(INDEX, DOC_CN, "nihao");
223        assert_eq!(pinyin, direct);
224    }
225
226    #[test]
227    fn highlight_prefers_original_for_mixed_scripts() {
228        let mut index = InMemoryIndex::default();
229        index.add_doc(INDEX, DOC_CN, "hello 世界", true);
230
231        let hits = index.search_hits(INDEX, "hello shi");
232        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
233            panic!("expected hit for mixed script query");
234        };
235        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
236        let content = index.get_doc(INDEX, DOC_CN).unwrap();
237        let slices: Vec<String> = matches
238            .iter()
239            .map(|(s, e)| utf16_slice(&content, *s, *e))
240            .collect();
241        assert!(
242            slices.iter().any(|s| s == "hello"),
243            "expected original spans for mixed script matches, got {:?}",
244            slices
245        );
246        if slices.iter().any(|s| s.chars().any(|c| !c.is_ascii())) {
247            assert!(
248                slices.iter().any(|s| s == "世界"),
249                "expected CJK spans for mixed script matches, got {:?}",
250                slices
251            );
252        }
253    }
254
255    #[test]
256    fn pinyin_prefix_highlight_uses_original_spans() {
257        let mut index = InMemoryIndex::default();
258        index.add_doc(INDEX, DOC_CN, "你好世界", true);
259
260        let hits = index.search_hits(INDEX, "nih");
261        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
262            panic!("expected prefix pinyin hit");
263        };
264        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
265        let direct = index.get_matches(INDEX, DOC_CN, "你好");
266        assert_eq!(
267            matches, direct,
268            "prefix highlight should map back to original spans"
269        );
270    }
271
272    #[test]
273    fn pinyin_highlight_handles_trailing_ascii() {
274        let mut index = InMemoryIndex::with_position_encoding(PositionEncoding::Utf16);
275        index.add_doc(
276            INDEX,
277            DOC_CN,
278            "美光将在全球内存供应短缺之际退出消费级内存业务",
279            true,
280        );
281
282        let hits = index.search_hits(INDEX, "neicun");
283        let hit = hits
284            .iter()
285            .find(|h| h.doc_id == DOC_CN)
286            .unwrap_or_else(|| panic!("expected hit for neicun, got {:?}", hits));
287        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
288        assert!(
289            !matches.is_empty(),
290            "expected highlight spans for pinyin match, got none"
291        );
292        let content = index.get_doc(INDEX, DOC_CN).unwrap();
293        let slices: Vec<String> = matches
294            .iter()
295            .map(|(s, e)| utf16_slice(&content, *s, *e))
296            .collect();
297        assert!(
298            slices.iter().all(|s| s == "内存"),
299            "expected highlights to stay on original term, got {:?}",
300            slices
301        );
302    }
303
304    fn utf16_slice(content: &str, start: u32, end: u32) -> String {
305        let mut utf16_pos = 0u32;
306        let mut start_byte = 0usize;
307        let mut end_byte = content.len();
308        for (idx, ch) in content.char_indices() {
309            if utf16_pos == start {
310                start_byte = idx;
311            }
312            utf16_pos += ch.len_utf16() as u32;
313            if utf16_pos == end {
314                end_byte = idx + ch.len_utf8();
315                break;
316            }
317        }
318        content[start_byte..end_byte].to_string()
319    }
320
321    #[test]
322    fn exact_search_prefers_original_terms() {
323        let mut index = InMemoryIndex::default();
324        index.add_doc(INDEX, DOC_EN, "nihao greeting", true);
325        index.add_doc(INDEX, DOC_CN, "你好世界", true);
326
327        let exact_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Exact);
328        assert_contains_doc(&exact_hits, DOC_EN);
329        assert!(
330            exact_hits.iter().all(|(id, _)| id == DOC_EN),
331            "expected exact search to ignore pinyin matches, got {:?}",
332            exact_hits
333        );
334
335        let auto_hits = index.search(INDEX, "nihao");
336        assert_contains_doc(&auto_hits, DOC_EN);
337        assert!(
338            auto_hits.iter().all(|(id, _)| id != DOC_CN),
339            "auto search should stop at exact matches"
340        );
341
342        let pinyin_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Pinyin);
343        assert_contains_doc(&pinyin_hits, DOC_CN);
344    }
345
346    #[test]
347    fn japanese_ngram_search() {
348        let mut index = InMemoryIndex::default();
349        index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
350
351        let hits = index.search(INDEX, "こん");
352        assert_contains_doc(&hits, DOC_JP);
353
354        let matches = index.get_matches(INDEX, DOC_JP, "こん");
355        assert!(
356            !matches.is_empty(),
357            "expected offsets for japanese ngram matches"
358        );
359    }
360
361    #[test]
362    fn kanji_adjacent_to_kana_skips_pinyin() {
363        let mut index = InMemoryIndex::default();
364        index.add_doc(INDEX, DOC_JP, "東京へようこそ", true);
365
366        let hits = index.search_with_mode(INDEX, "dongjing", SearchMode::Pinyin);
367        assert!(
368            hits.is_empty(),
369            "kanji near kana should not derive pinyin, got {:?}",
370            hits
371        );
372    }
373
374    #[test]
375    fn exact_search_applies_minimum_should_match() {
376        let mut index = InMemoryIndex::default();
377        index.add_doc(INDEX, "doc-2-terms", "apple banana", true);
378        index.add_doc(INDEX, "doc-3-terms", "apple banana cherry", true);
379        index.add_doc(INDEX, "doc-1-term", "apple", true);
380
381        let hits = index.search_with_mode(INDEX, "apple banana cherry", SearchMode::Exact);
382
383        assert_contains_doc(&hits, "doc-2-terms");
384        assert_contains_doc(&hits, "doc-3-terms");
385        assert!(
386            !hits.iter().any(|(id, _)| id == "doc-1-term"),
387            "docs below minimum_should_match should be filtered out"
388        );
389
390        let score_two = hits
391            .iter()
392            .find(|(id, _)| id == "doc-2-terms")
393            .map(|(_, s)| *s)
394            .unwrap();
395        let score_three = hits
396            .iter()
397            .find(|(id, _)| id == "doc-3-terms")
398            .map(|(_, s)| *s)
399            .unwrap();
400        assert!(
401            score_three > score_two,
402            "more matched terms should score higher: {} vs {}",
403            score_three,
404            score_two
405        );
406    }
407
408    #[test]
409    fn pinyin_polyphonic_variants_for_short_tokens() {
410        let mut index = InMemoryIndex::default();
411        index.add_doc(INDEX, DOC_CN, "重庆火锅", true);
412
413        let hits_zhong = index.search_with_mode_hits(INDEX, "zhongqing", SearchMode::Pinyin);
414        assert!(
415            hits_zhong.iter().any(|h| h.doc_id == DOC_CN),
416            "expected zhongqing variant to hit"
417        );
418
419        let hits_chong = index.search_with_mode_hits(INDEX, "chongqing", SearchMode::Pinyin);
420        assert!(
421            hits_chong.iter().any(|h| h.doc_id == DOC_CN),
422            "expected chongqing variant to hit"
423        );
424
425        let matched_terms: Vec<MatchedTerm> = hits_zhong
426            .into_iter()
427            .find(|h| h.doc_id == DOC_CN)
428            .map(|h| h.matched_terms)
429            .unwrap_or_default();
430        assert!(
431            matched_terms
432                .iter()
433                .any(|t| t.term.contains("zhongqing") || t.term.contains("chongqing")),
434            "expected polyphonic pinyin variants in matched_terms, got {:?}",
435            matched_terms
436        );
437    }
438
439    #[test]
440    fn get_matches_for_terms_uses_matched_terms() {
441        let mut index = InMemoryIndex::default();
442        index.add_doc(INDEX, DOC_EN, "memoryIndexer", true);
443
444        let hits = index.search_hits(INDEX, "memryindexer");
445        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_EN) else {
446            panic!("expected hit for doc");
447        };
448        assert!(
449            hit.matched_terms
450                .iter()
451                .any(|t| t.term == "memoryindexer" && t.domain == TermDomain::Original),
452            "expected matched term memoryIndexer, got {:?}",
453            hit.matched_terms
454        );
455
456        let matches = index.get_matches_for_matched_terms(INDEX, DOC_EN, &hit.matched_terms);
457        assert!(!matches.is_empty(), "expected matches from matched_terms");
458    }
459
460    #[test]
461    fn fullwidth_pinyin_query_hits() {
462        let mut index = InMemoryIndex::default();
463        index.add_doc(INDEX, DOC_CN, "你好世界", true);
464
465        // Full-width ASCII should normalize to ASCII and derive pinyin.
466        let hits = index.search_hits(INDEX, "NIHAO");
467        assert!(
468            hits.iter().any(|h| h.doc_id == DOC_CN),
469            "expected full-width pinyin query to hit, got {:?}",
470            hits.iter()
471                .map(|h| (&h.doc_id, &h.matched_terms))
472                .collect::<Vec<_>>()
473        );
474        let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
475            h.matched_terms
476                .iter()
477                .find(|t| t.domain == TermDomain::PinyinFull)
478        });
479        assert!(
480            matched.is_some(),
481            "expected matched pinyin full term, got {:?}",
482            hits.iter()
483                .find(|h| h.doc_id == DOC_CN)
484                .map(|h| h.matched_terms.clone())
485        );
486    }
487
488    #[test]
489    fn short_pinyin_fuzzy_hits() {
490        let mut index = InMemoryIndex::default();
491        index.add_doc(INDEX, DOC_CN, "你好", true);
492
493        // Missing one character should still fuzzy match via pinyin domain.
494        let hits = index.search_hits(INDEX, "niha");
495        assert!(
496            hits.iter().any(|h| h.doc_id == DOC_CN),
497            "expected fuzzy pinyin hit for short query, got {:?}",
498            hits.iter()
499                .map(|h| (&h.doc_id, &h.matched_terms))
500                .collect::<Vec<_>>()
501        );
502        let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
503            h.matched_terms
504                .iter()
505                .find(|t| matches!(t.domain, TermDomain::PinyinFull))
506        });
507        assert!(
508            matched.is_some(),
509            "expected matched pinyin term, got {:?}",
510            hits.iter()
511                .find(|h| h.doc_id == DOC_CN)
512                .map(|h| h.matched_terms.clone())
513        );
514    }
515
516    #[test]
517    fn non_ascii_auto_fuzzy_fallback() {
518        let mut index = InMemoryIndex::default();
519        index.add_doc(INDEX, DOC_CN, "北京大学", true);
520
521        // Typo on the last character should still match via non-ASCII fuzzy fallback.
522        let hits = index.search_hits(INDEX, "北景大学");
523        assert!(
524            hits.iter().any(|h| h.doc_id == DOC_CN),
525            "expected non-ascii fuzzy fallback to hit, got {:?}",
526            hits.iter()
527                .map(|h| (&h.doc_id, &h.matched_terms))
528                .collect::<Vec<_>>()
529        );
530    }
531
532    #[test]
533    fn mixed_script_query_hits_all_tokens() {
534        let mut index = InMemoryIndex::default();
535        index.add_doc(INDEX, DOC_CN, "hello 世界", true);
536
537        let hits = index.search_hits(INDEX, "hello 世界");
538        assert!(
539            hits.iter().any(|h| h.doc_id == DOC_CN),
540            "expected mixed-script query to hit doc, got {:?}",
541            hits.iter()
542                .map(|h| (&h.doc_id, &h.matched_terms))
543                .collect::<Vec<_>>()
544        );
545        let matched = hits
546            .iter()
547            .find(|h| h.doc_id == DOC_CN)
548            .map(|h| h.matched_terms.clone())
549            .unwrap_or_default();
550        assert!(
551            matched
552                .iter()
553                .any(|t| t.term == "hello" && t.domain == TermDomain::Original),
554            "expected matched original term hello, got {:?}",
555            matched
556        );
557        assert!(
558            matched.iter().any(|t| t.term == "世界"),
559            "expected matched CJK term 世界, got {:?}",
560            matched
561        );
562    }
563
564    #[test]
565    fn chinese_oov_fuzzy_recall() {
566        let mut index = InMemoryIndex::default();
567        index.add_doc(INDEX, DOC_CN, "明博", true);
568
569        // Typo on the second char should still recall via non-ASCII fuzzy fallback.
570        let hits = index.search_hits(INDEX, "明搏");
571        assert!(
572            hits.iter().any(|h| h.doc_id == DOC_CN),
573            "expected OOV chinese fuzzy to hit, got {:?}",
574            hits.iter()
575                .map(|h| (&h.doc_id, &h.matched_terms))
576                .collect::<Vec<_>>()
577        );
578    }
579
580    #[test]
581    fn load_snapshot_restores_domains_and_lengths() {
582        let mut index = InMemoryIndex::default();
583        index.add_doc(INDEX, DOC_CN, "你好世界", true);
584
585        let snapshot = index
586            .get_snapshot_data(INDEX)
587            .expect("snapshot should exist");
588        let expected_total_len = snapshot.total_len;
589        let expected_domain_len = snapshot.domain_total_len.get(TermDomain::Original);
590
591        let mut restored = InMemoryIndex::default();
592        restored.load_snapshot(INDEX, snapshot);
593
594        let hits = restored.search_hits(INDEX, "nihap");
595        assert!(
596            hits.iter().any(|hit| hit.doc_id == DOC_CN),
597            "expected restored index to serve pinyin fuzzy hits"
598        );
599        let restored_state = restored
600            .indexes
601            .get(INDEX)
602            .expect("restored index state should exist");
603        assert_eq!(restored_state.total_len, expected_total_len);
604        assert_eq!(
605            restored_state.domain_total_len.get(TermDomain::Original),
606            expected_domain_len
607        );
608    }
609
610    #[test]
611    fn has_unpersisted_changes_tracks_dirty_and_deleted() {
612        let mut index = InMemoryIndex::default();
613        assert!(!index.has_unpersisted_changes(None));
614
615        index.add_doc(INDEX, DOC_EN, "pending doc", true);
616        assert!(index.has_unpersisted_changes(Some(INDEX)));
617        assert!(index.has_unpersisted_changes(None));
618
619        index.take_dirty_and_deleted();
620        assert!(!index.has_unpersisted_changes(Some(INDEX)));
621        assert!(!index.has_unpersisted_changes(None));
622
623        index.remove_doc(INDEX, DOC_EN);
624        assert!(index.has_unpersisted_changes(Some(INDEX)));
625        assert!(index.has_unpersisted_changes(None));
626    }
627
628    #[test]
629    fn load_snapshot_clears_pending_flags() {
630        let mut index = InMemoryIndex::default();
631        index.add_doc(INDEX, DOC_EN, "snapshot doc", true);
632
633        let snapshot = index
634            .get_snapshot_data(INDEX)
635            .expect("snapshot should exist");
636        assert!(index.has_unpersisted_changes(Some(INDEX)));
637
638        index.load_snapshot(INDEX, snapshot);
639        assert!(
640            !index.has_unpersisted_changes(Some(INDEX)),
641            "loading a snapshot should reset pending persistence markers"
642        );
643    }
644
645    #[test]
646    fn persist_if_dirty_skips_when_clean() {
647        let mut index = InMemoryIndex::default();
648        let mut called = false;
649
650        let persisted = index
651            .persist_if_dirty(INDEX, |_snapshot| -> Result<(), ()> {
652                called = true;
653                Ok(())
654            })
655            .unwrap();
656
657        assert!(!persisted, "clean index should skip persistence");
658        assert!(!called, "callback should not run when skipped");
659    }
660
661    #[test]
662    fn persist_if_dirty_persists_and_marks_clean_on_success() {
663        let mut index = InMemoryIndex::default();
664        index.add_doc(INDEX, DOC_EN, "persist me", true);
665
666        let mut called = false;
667        let persisted = index
668            .persist_if_dirty(INDEX, |snapshot| -> Result<(), ()> {
669                called = true;
670                assert_eq!(snapshot.docs.len(), 1, "snapshot should include doc");
671                Ok(())
672            })
673            .unwrap();
674
675        assert!(persisted, "dirty index should persist");
676        assert!(called, "callback should run on persistence");
677        assert!(
678            !index.has_unpersisted_changes(Some(INDEX)),
679            "successful persist should mark index clean"
680        );
681    }
682
683    #[test]
684    fn persist_if_dirty_keeps_pending_on_error() {
685        let mut index = InMemoryIndex::default();
686        index.add_doc(INDEX, DOC_EN, "persist error", true);
687
688        let err = index
689            .persist_if_dirty(INDEX, |_snapshot| -> Result<(), &'static str> {
690                Err("boom")
691            })
692            .unwrap_err();
693        assert_eq!(err, "boom");
694        assert!(
695            index.has_unpersisted_changes(Some(INDEX)),
696            "failed persist should leave index dirty"
697        );
698    }
699
700    #[test]
701    fn fuzzy_msm_filters_insufficient_matches() {
702        let mut index = InMemoryIndex::default();
703        index.add_doc(INDEX, "doc-long", "apple banana", true);
704        index.add_doc(INDEX, "doc-short", "apple", true);
705
706        let hits = index.search_with_mode_hits(INDEX, "applr banaan", SearchMode::Fuzzy);
707        assert!(
708            hits.iter().any(|h| h.doc_id == "doc-long"),
709            "expected fuzzy msm to keep doc with both terms, got {:?}",
710            hits
711        );
712        assert!(
713            hits.iter().all(|h| h.doc_id != "doc-short"),
714            "docs below min_should_match should be filtered out: {:?}",
715            hits
716        );
717    }
718
719    #[test]
720    fn short_cjk_fuzzy_recall_uses_2gram() {
721        let mut index = InMemoryIndex::default();
722        index.add_doc(INDEX, "doc-short-cjk", "方案", true);
723
724        let hits = index.search_hits(INDEX, "方桉");
725        assert!(
726            hits.iter().any(|h| h.doc_id == "doc-short-cjk"),
727            "expected 2-gram fuzzy recall for short CJK tokens, got {:?}",
728            hits
729        );
730    }
731
732    #[test]
733    fn dictionary_load_and_fallback() {
734        let dir = tempdir().unwrap();
735        let path = dir.path().join("dict.json");
736
737        let mut entries = HashSet::new();
738        entries.insert("こんにちは".to_string());
739        let config = DictionaryConfig {
740            japanese: Some(ScriptDictionary {
741                version: Some("v1".to_string()),
742                entries,
743            }),
744            hangul: None,
745        };
746
747        std::fs::write(&path, serde_json::to_vec(&config).unwrap()).unwrap();
748        let loaded: DictionaryConfig =
749            serde_json::from_slice(&std::fs::read(&path).unwrap()).expect("should deserialize");
750
751        let mut index = InMemoryIndex::with_dictionary_config(loaded.clone());
752        index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
753
754        let hits = index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
755        assert!(
756            hits.iter().any(|h| h.doc_id == DOC_JP),
757            "expected dictionary-backed search hit, got {:?}",
758            hits
759        );
760        let mut fallback_index = InMemoryIndex::default();
761        fallback_index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
762        let fallback_hits =
763            fallback_index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
764        assert!(
765            fallback_hits.iter().any(|h| h.doc_id == DOC_JP),
766            "expected fallback tokenization to still recall doc, got {:?}",
767            fallback_hits
768        );
769    }
770
771    #[test]
772    fn id_like_tokens_match_exact() {
773        let mut index = InMemoryIndex::default();
774        let doc_id = "doc-id";
775        let id_like = "IKPeA9Zu9eo_pXlKWVFcf";
776
777        index.add_doc(INDEX, doc_id, id_like, true);
778
779        let hits = index.search_with_mode_hits(INDEX, id_like, SearchMode::Exact);
780        assert!(
781            hits.iter().any(|h| h.doc_id == doc_id),
782            "expected exact search to hit id-like token, got {:?}",
783            hits
784        );
785    }
786}