Skip to main content

memory_indexer/
lib.rs

1mod base;
2mod index;
3mod ngram;
4mod pipeline;
5mod search;
6mod tokenizer;
7mod types;
8
9pub use types::{
10    DocData, InMemoryIndex, PositionEncoding, SNAPSHOT_VERSION, SearchHit, SearchMode,
11    SnapshotData, TermDomain,
12};
13
14pub use tokenizer::dictionary::{
15    DictionaryConfig, DictionaryLanguage, DictionarySegmenter, ScriptDictionary,
16    train_dictionary_config,
17};
18
19#[cfg(test)]
20mod tests {
21    use super::types::MatchedTerm;
22    use super::*;
23    use std::collections::HashSet;
24    use tempfile::tempdir;
25
26    // #[test]
27    // #[ignore = "only use for local load/save testing"]
28    // fn load_index_snapshot() {
29    //     let ts = std::time::Instant::now();
30    //     if let Ok(decompressed) =
31    //         zstd::stream::decode_all(std::io::Cursor::new(&include_bytes!("../index.bin")))
32    //     {
33    //         println!(
34    //             "Decompression took {:?}, {}",
35    //             ts.elapsed(),
36    //             decompressed.len()
37    //         );
38    //         let ts = std::time::Instant::now();
39    //         let config = bincode::config::standard();
40    //         let snapshot: SnapshotData = bincode::serde::decode_from_slice(&decompressed, config)
41    //             .unwrap()
42    //             .0;
43    //         println!("Deserialization took {:?}", ts.elapsed());
44    //         println!(
45    //             "docs: {}, domains: {}, total_len: {}",
46    //             snapshot.docs.len(),
47    //             snapshot.domains.len(),
48    //             snapshot.total_len
49    //         );
50    //         let ts = std::time::Instant::now();
51    //         let mut index = InMemoryIndex::default();
52    //         index.load_snapshot("test-index", snapshot);
53    //         println!("Loading into index took {:?}", ts.elapsed());
54    //     }
55    // }
56
    // Name of the index that the tests in this module add documents to and query.
    const INDEX: &str = "test-index";
    // Shared document ids; the suffix hints at the language of the text usually indexed under
    // each id (Chinese, English, Japanese).
    const DOC_CN: &str = "doc-cn";
    const DOC_EN: &str = "doc-en";
    const DOC_JP: &str = "doc-jp";
61
62    fn assert_contains_doc(results: &[(String, f64)], doc_id: &str) {
63        assert!(
64            results.iter().any(|(id, _)| id == doc_id),
65            "expected results to contain doc {doc_id}, got {:?}",
66            results
67        );
68    }
69
70    #[test]
71    fn chinese_full_pinyin_search() {
72        let mut index = InMemoryIndex::default();
73        index.add_doc(INDEX, DOC_CN, "你好世界", true);
74
75        let hits = index.search(INDEX, "nihao");
76        assert_contains_doc(&hits, DOC_CN);
77    }
78
79    #[test]
80    fn chinese_initials_search() {
81        let mut index = InMemoryIndex::default();
82        index.add_doc(INDEX, DOC_CN, "你好世界", true);
83
84        let hits = index.search(INDEX, "nh");
85        assert_contains_doc(&hits, DOC_CN);
86    }
87
88    #[test]
89    fn chinese_initials_prefix_search() {
90        let mut index = InMemoryIndex::default();
91        index.add_doc(INDEX, DOC_CN, "你好世界", true);
92
93        let hits = index.search(INDEX, "nhs");
94        assert_contains_doc(&hits, DOC_CN);
95
96        let exact = index.get_matches(INDEX, DOC_CN, "nhsj");
97        assert!(!exact.is_empty());
98        let hit = index
99            .search_hits(INDEX, "nhs")
100            .into_iter()
101            .find(|h| h.doc_id == DOC_CN)
102            .expect("expected hit for prefix query");
103        let prefix_matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
104        assert!(!prefix_matches.is_empty());
105        assert!(
106            prefix_matches
107                .iter()
108                .any(|p| exact.iter().any(|e| e.0 == p.0)),
109            "prefix highlight should align to original start"
110        );
111    }
112
113    #[test]
114    fn chinese_full_pinyin_prefix_search() {
115        let mut index = InMemoryIndex::default();
116        index.add_doc(INDEX, DOC_CN, "你好世界", true);
117
118        let hits = index.search(INDEX, "nih");
119        assert_contains_doc(&hits, DOC_CN);
120
121        let exact = index.get_matches(INDEX, DOC_CN, "nihaoshijie");
122        assert!(!exact.is_empty());
123        let hit = index
124            .search_hits(INDEX, "nih")
125            .into_iter()
126            .find(|h| h.doc_id == DOC_CN)
127            .expect("expected hit for prefix query");
128        let prefix_matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
129        assert!(!prefix_matches.is_empty());
130        assert!(
131            prefix_matches
132                .iter()
133                .any(|p| exact.iter().any(|e| e.0 == p.0)),
134            "prefix highlight should align to original start"
135        );
136    }
137
138    #[test]
139    fn pinyin_fuzzy_search() {
140        let mut index = InMemoryIndex::default();
141        index.add_doc(INDEX, DOC_CN, "你好世界", true);
142
143        let hits = index.search_hits(INDEX, "nihap");
144        assert!(
145            hits.iter()
146                .any(|h| h.doc_id == DOC_CN && !h.matched_terms.is_empty()),
147            "expected matched pinyin term in fuzzy hits: {:?}",
148            hits.iter()
149                .map(|h| (&h.doc_id, &h.matched_terms))
150                .collect::<Vec<_>>()
151        );
152
153        let fuzzy_original = index.search_with_mode(INDEX, "nihap", SearchMode::Fuzzy);
154        assert!(
155            fuzzy_original.is_empty(),
156            "expected SearchMode::Fuzzy to only search original domain, got {:?}",
157            fuzzy_original
158        );
159    }
160
161    #[test]
162    fn english_fuzzy_search() {
163        let mut index = InMemoryIndex::default();
164        index.add_doc(INDEX, DOC_EN, "fuzzy search handles typos", true);
165
166        let hits = index.search_hits(INDEX, "fuzze");
167        assert!(hits.iter().any(|h| {
168            h.doc_id == DOC_EN
169                && h.matched_terms
170                    .iter()
171                    .any(|t| t.term == "fuzzy" && t.domain == TermDomain::Original)
172        }));
173    }
174
175    #[test]
176    fn english_query_splits_separators_and_lowercases() {
177        let mut index = InMemoryIndex::default();
178        index.add_doc(INDEX, DOC_EN, "MEMORY-INDEXER", true);
179
180        let hits = index.search_with_mode(INDEX, "memory-indexer", SearchMode::Exact);
181        assert_contains_doc(&hits, DOC_EN);
182    }
183
184    #[test]
185    fn cyrillic_term_matches_inside_phrase() {
186        let mut index = InMemoryIndex::default();
187        let doc_id = "doc-ru";
188        index.add_doc(INDEX, doc_id, "привет мир", true);
189
190        let hits = index.search_with_mode(INDEX, "привет", SearchMode::Exact);
191        assert_contains_doc(&hits, doc_id);
192    }
193
194    #[test]
195    fn greek_term_matches_inside_phrase() {
196        let mut index = InMemoryIndex::default();
197        let doc_id = "doc-gr";
198        index.add_doc(INDEX, doc_id, "γειά σου κόσμε", true);
199
200        let hits = index.search_with_mode(INDEX, "γειά", SearchMode::Exact);
201        assert_contains_doc(&hits, doc_id);
202    }
203
204    #[test]
205    fn cyrillic_term_matches_with_punctuation() {
206        let mut index = InMemoryIndex::default();
207        let doc_id = "doc-ru-punct";
208        index.add_doc(INDEX, doc_id, "привет, привет", true);
209
210        let hits = index.search_with_mode(INDEX, "привет", SearchMode::Exact);
211        assert_contains_doc(&hits, doc_id);
212    }
213
214    #[test]
215    fn armenian_term_matches_with_punctuation() {
216        let mut index = InMemoryIndex::default();
217        let doc_id = "doc-hy-punct";
218        index.add_doc(INDEX, doc_id, "բարեւ, աշխարհ", true);
219
220        let hits = index.search_with_mode(INDEX, "բարեւ", SearchMode::Exact);
221        assert_contains_doc(&hits, doc_id);
222    }
223
224    #[test]
225    fn fuzzy_search_allows_alphanumeric_terms() {
226        let mut index = InMemoryIndex::default();
227        index.add_doc(INDEX, DOC_EN, "version2 stable", true);
228
229        let hits = index.search_with_mode(INDEX, "versoin2", SearchMode::Fuzzy);
230        assert_contains_doc(&hits, DOC_EN);
231    }
232
233    #[test]
234    fn fuzzy_search_handles_separated_query_terms() {
235        let mut index = InMemoryIndex::default();
236        index.add_doc(INDEX, DOC_EN, "memory-indexer", true);
237
238        let hits = index.search_with_mode(INDEX, "memry-indexer", SearchMode::Fuzzy);
239        assert_contains_doc(&hits, DOC_EN);
240    }
241
242    #[test]
243    fn fuzzy_search_handles_short_terms() {
244        let mut index = InMemoryIndex::default();
245        index.add_doc(INDEX, DOC_EN, "go go", true);
246
247        let hits = index.search_with_mode(INDEX, "go", SearchMode::Fuzzy);
248        assert_contains_doc(&hits, DOC_EN);
249    }
250
251    #[test]
252    fn pinyin_highlight_uses_original_positions() {
253        let mut index = InMemoryIndex::default();
254        index.add_doc(INDEX, DOC_CN, "你好世界", true);
255
256        let direct = index.get_matches(INDEX, DOC_CN, "你好");
257        assert!(
258            !direct.is_empty(),
259            "expected direct chinese match to have positions"
260        );
261
262        let pinyin = index.get_matches(INDEX, DOC_CN, "nihao");
263        assert_eq!(pinyin, direct);
264    }
265
266    #[test]
267    fn highlight_prefers_original_for_mixed_scripts() {
268        let mut index = InMemoryIndex::default();
269        index.add_doc(INDEX, DOC_CN, "hello 世界", true);
270
271        let hits = index.search_hits(INDEX, "hello shi");
272        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
273            panic!("expected hit for mixed script query");
274        };
275        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
276        let content = index.get_doc(INDEX, DOC_CN).unwrap();
277        let slices: Vec<String> = matches
278            .iter()
279            .map(|(s, e)| utf16_slice(&content, *s, *e))
280            .collect();
281        assert!(
282            slices.iter().any(|s| s == "hello"),
283            "expected original spans for mixed script matches, got {:?}",
284            slices
285        );
286        if slices.iter().any(|s| s.chars().any(|c| !c.is_ascii())) {
287            assert!(
288                slices.iter().any(|s| s == "世界"),
289                "expected CJK spans for mixed script matches, got {:?}",
290                slices
291            );
292        }
293    }
294
295    #[test]
296    fn pinyin_prefix_highlight_uses_original_spans() {
297        let mut index = InMemoryIndex::default();
298        index.add_doc(INDEX, DOC_CN, "你好世界", true);
299
300        let hits = index.search_hits(INDEX, "nih");
301        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
302            panic!("expected prefix pinyin hit");
303        };
304        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
305        let direct = index.get_matches(INDEX, DOC_CN, "你好");
306        assert_eq!(
307            matches, direct,
308            "prefix highlight should map back to original spans"
309        );
310    }
311
312    #[test]
313    fn pinyin_highlight_handles_trailing_ascii() {
314        let mut index = InMemoryIndex::with_position_encoding(PositionEncoding::Utf16);
315        index.add_doc(
316            INDEX,
317            DOC_CN,
318            "美光将在全球内存供应短缺之际退出消费级内存业务",
319            true,
320        );
321
322        let hits = index.search_hits(INDEX, "neicun");
323        let hit = hits
324            .iter()
325            .find(|h| h.doc_id == DOC_CN)
326            .unwrap_or_else(|| panic!("expected hit for neicun, got {:?}", hits));
327        let matches = index.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
328        assert!(
329            !matches.is_empty(),
330            "expected highlight spans for pinyin match, got none"
331        );
332        let content = index.get_doc(INDEX, DOC_CN).unwrap();
333        let slices: Vec<String> = matches
334            .iter()
335            .map(|(s, e)| utf16_slice(&content, *s, *e))
336            .collect();
337        assert!(
338            slices.iter().all(|s| s == "内存"),
339            "expected highlights to stay on original term, got {:?}",
340            slices
341        );
342    }
343
344    fn utf16_slice(content: &str, start: u32, end: u32) -> String {
345        let mut utf16_pos = 0u32;
346        let mut start_byte = 0usize;
347        let mut end_byte = content.len();
348        for (idx, ch) in content.char_indices() {
349            if utf16_pos == start {
350                start_byte = idx;
351            }
352            utf16_pos += ch.len_utf16() as u32;
353            if utf16_pos == end {
354                end_byte = idx + ch.len_utf8();
355                break;
356            }
357        }
358        content[start_byte..end_byte].to_string()
359    }
360
361    #[test]
362    fn exact_search_prefers_original_terms() {
363        let mut index = InMemoryIndex::default();
364        index.add_doc(INDEX, DOC_EN, "nihao greeting", true);
365        index.add_doc(INDEX, DOC_CN, "你好世界", true);
366
367        let exact_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Exact);
368        assert_contains_doc(&exact_hits, DOC_EN);
369        assert!(
370            exact_hits.iter().all(|(id, _)| id == DOC_EN),
371            "expected exact search to ignore pinyin matches, got {:?}",
372            exact_hits
373        );
374
375        let auto_hits = index.search(INDEX, "nihao");
376        assert_contains_doc(&auto_hits, DOC_EN);
377        assert!(
378            auto_hits.iter().all(|(id, _)| id != DOC_CN),
379            "auto search should stop at exact matches"
380        );
381
382        let pinyin_hits = index.search_with_mode(INDEX, "nihao", SearchMode::Pinyin);
383        assert_contains_doc(&pinyin_hits, DOC_CN);
384    }
385
386    #[test]
387    fn japanese_ngram_search() {
388        let mut index = InMemoryIndex::default();
389        index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
390
391        let hits = index.search(INDEX, "こん");
392        assert_contains_doc(&hits, DOC_JP);
393
394        let matches = index.get_matches(INDEX, DOC_JP, "こん");
395        assert!(
396            !matches.is_empty(),
397            "expected offsets for japanese ngram matches"
398        );
399    }
400
401    #[test]
402    fn kanji_adjacent_to_kana_skips_pinyin() {
403        let mut index = InMemoryIndex::default();
404        index.add_doc(INDEX, DOC_JP, "東京へようこそ", true);
405
406        let hits = index.search_with_mode(INDEX, "dongjing", SearchMode::Pinyin);
407        assert!(
408            hits.is_empty(),
409            "kanji near kana should not derive pinyin, got {:?}",
410            hits
411        );
412    }
413
414    #[test]
415    fn exact_search_applies_minimum_should_match() {
416        let mut index = InMemoryIndex::default();
417        index.add_doc(INDEX, "doc-2-terms", "apple banana", true);
418        index.add_doc(INDEX, "doc-3-terms", "apple banana cherry", true);
419        index.add_doc(INDEX, "doc-1-term", "apple", true);
420
421        let hits = index.search_with_mode(INDEX, "apple banana cherry", SearchMode::Exact);
422
423        assert_contains_doc(&hits, "doc-2-terms");
424        assert_contains_doc(&hits, "doc-3-terms");
425        assert!(
426            !hits.iter().any(|(id, _)| id == "doc-1-term"),
427            "docs below minimum_should_match should be filtered out"
428        );
429
430        let score_two = hits
431            .iter()
432            .find(|(id, _)| id == "doc-2-terms")
433            .map(|(_, s)| *s)
434            .unwrap();
435        let score_three = hits
436            .iter()
437            .find(|(id, _)| id == "doc-3-terms")
438            .map(|(_, s)| *s)
439            .unwrap();
440        assert!(
441            score_three > score_two,
442            "more matched terms should score higher: {} vs {}",
443            score_three,
444            score_two
445        );
446    }
447
448    #[test]
449    fn pinyin_polyphonic_variants_for_short_tokens() {
450        let mut index = InMemoryIndex::default();
451        index.add_doc(INDEX, DOC_CN, "重庆火锅", true);
452
453        let hits_zhong = index.search_with_mode_hits(INDEX, "zhongqing", SearchMode::Pinyin);
454        assert!(
455            hits_zhong.iter().any(|h| h.doc_id == DOC_CN),
456            "expected zhongqing variant to hit"
457        );
458
459        let hits_chong = index.search_with_mode_hits(INDEX, "chongqing", SearchMode::Pinyin);
460        assert!(
461            hits_chong.iter().any(|h| h.doc_id == DOC_CN),
462            "expected chongqing variant to hit"
463        );
464
465        let matched_terms: Vec<MatchedTerm> = hits_zhong
466            .into_iter()
467            .find(|h| h.doc_id == DOC_CN)
468            .map(|h| h.matched_terms)
469            .unwrap_or_default();
470        assert!(
471            matched_terms
472                .iter()
473                .any(|t| t.term.contains("zhongqing") || t.term.contains("chongqing")),
474            "expected polyphonic pinyin variants in matched_terms, got {:?}",
475            matched_terms
476        );
477    }
478
479    #[test]
480    fn get_matches_for_terms_uses_matched_terms() {
481        let mut index = InMemoryIndex::default();
482        index.add_doc(INDEX, DOC_EN, "memoryIndexer", true);
483
484        let hits = index.search_hits(INDEX, "memryindexer");
485        let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_EN) else {
486            panic!("expected hit for doc");
487        };
488        assert!(
489            hit.matched_terms
490                .iter()
491                .any(|t| t.term == "memoryindexer" && t.domain == TermDomain::Original),
492            "expected matched term memoryIndexer, got {:?}",
493            hit.matched_terms
494        );
495
496        let matches = index.get_matches_for_matched_terms(INDEX, DOC_EN, &hit.matched_terms);
497        assert!(!matches.is_empty(), "expected matches from matched_terms");
498    }
499
500    #[test]
501    fn fullwidth_pinyin_query_hits() {
502        let mut index = InMemoryIndex::default();
503        index.add_doc(INDEX, DOC_CN, "你好世界", true);
504
505        // Full-width ASCII should normalize to ASCII and derive pinyin.
506        let hits = index.search_hits(INDEX, "NIHAO");
507        assert!(
508            hits.iter().any(|h| h.doc_id == DOC_CN),
509            "expected full-width pinyin query to hit, got {:?}",
510            hits.iter()
511                .map(|h| (&h.doc_id, &h.matched_terms))
512                .collect::<Vec<_>>()
513        );
514        let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
515            h.matched_terms
516                .iter()
517                .find(|t| t.domain == TermDomain::PinyinFull)
518        });
519        assert!(
520            matched.is_some(),
521            "expected matched pinyin full term, got {:?}",
522            hits.iter()
523                .find(|h| h.doc_id == DOC_CN)
524                .map(|h| h.matched_terms.clone())
525        );
526    }
527
528    #[test]
529    fn short_pinyin_fuzzy_hits() {
530        let mut index = InMemoryIndex::default();
531        index.add_doc(INDEX, DOC_CN, "你好", true);
532
533        // Missing one character should still fuzzy match via pinyin domain.
534        let hits = index.search_hits(INDEX, "niha");
535        assert!(
536            hits.iter().any(|h| h.doc_id == DOC_CN),
537            "expected fuzzy pinyin hit for short query, got {:?}",
538            hits.iter()
539                .map(|h| (&h.doc_id, &h.matched_terms))
540                .collect::<Vec<_>>()
541        );
542        let matched = hits.iter().find(|h| h.doc_id == DOC_CN).and_then(|h| {
543            h.matched_terms
544                .iter()
545                .find(|t| matches!(t.domain, TermDomain::PinyinFull))
546        });
547        assert!(
548            matched.is_some(),
549            "expected matched pinyin term, got {:?}",
550            hits.iter()
551                .find(|h| h.doc_id == DOC_CN)
552                .map(|h| h.matched_terms.clone())
553        );
554    }
555
556    #[test]
557    fn non_ascii_auto_fuzzy_fallback() {
558        let mut index = InMemoryIndex::default();
559        index.add_doc(INDEX, DOC_CN, "北京大学", true);
560
561        // Typo on the last character should still match via non-ASCII fuzzy fallback.
562        let hits = index.search_hits(INDEX, "北景大学");
563        assert!(
564            hits.iter().any(|h| h.doc_id == DOC_CN),
565            "expected non-ascii fuzzy fallback to hit, got {:?}",
566            hits.iter()
567                .map(|h| (&h.doc_id, &h.matched_terms))
568                .collect::<Vec<_>>()
569        );
570    }
571
572    #[test]
573    fn mixed_script_query_hits_all_tokens() {
574        let mut index = InMemoryIndex::default();
575        index.add_doc(INDEX, DOC_CN, "hello 世界", true);
576
577        let hits = index.search_hits(INDEX, "hello 世界");
578        assert!(
579            hits.iter().any(|h| h.doc_id == DOC_CN),
580            "expected mixed-script query to hit doc, got {:?}",
581            hits.iter()
582                .map(|h| (&h.doc_id, &h.matched_terms))
583                .collect::<Vec<_>>()
584        );
585        let matched = hits
586            .iter()
587            .find(|h| h.doc_id == DOC_CN)
588            .map(|h| h.matched_terms.clone())
589            .unwrap_or_default();
590        assert!(
591            matched
592                .iter()
593                .any(|t| t.term == "hello" && t.domain == TermDomain::Original),
594            "expected matched original term hello, got {:?}",
595            matched
596        );
597        assert!(
598            matched.iter().any(|t| t.term == "世界"),
599            "expected matched CJK term 世界, got {:?}",
600            matched
601        );
602    }
603
604    #[test]
605    fn chinese_oov_fuzzy_recall() {
606        let mut index = InMemoryIndex::default();
607        index.add_doc(INDEX, DOC_CN, "明博", true);
608
609        // Typo on the second char should still recall via non-ASCII fuzzy fallback.
610        let hits = index.search_hits(INDEX, "明搏");
611        assert!(
612            hits.iter().any(|h| h.doc_id == DOC_CN),
613            "expected OOV chinese fuzzy to hit, got {:?}",
614            hits.iter()
615                .map(|h| (&h.doc_id, &h.matched_terms))
616                .collect::<Vec<_>>()
617        );
618    }
619
620    #[test]
621    fn load_snapshot_restores_domains_and_lengths() {
622        let mut index = InMemoryIndex::default();
623        index.add_doc(INDEX, DOC_CN, "你好世界", true);
624
625        let snapshot = index
626            .get_snapshot_data(INDEX)
627            .expect("snapshot should exist");
628        let expected_total_len = snapshot.total_len;
629        let expected_domain_len = snapshot.domain_total_len.get(TermDomain::Original);
630
631        let mut restored = InMemoryIndex::default();
632        restored.load_snapshot(INDEX, snapshot);
633
634        let hits = restored.search_hits(INDEX, "nihap");
635        assert!(
636            hits.iter().any(|hit| hit.doc_id == DOC_CN),
637            "expected restored index to serve pinyin fuzzy hits"
638        );
639        let restored_state = restored
640            .indexes
641            .get(INDEX)
642            .expect("restored index state should exist");
643        assert_eq!(restored_state.total_len, expected_total_len);
644        assert_eq!(
645            restored_state.domain_total_len.get(TermDomain::Original),
646            expected_domain_len
647        );
648    }
649
650    #[test]
651    fn has_unpersisted_changes_tracks_dirty_and_deleted() {
652        let mut index = InMemoryIndex::default();
653        assert!(!index.has_unpersisted_changes(None));
654
655        index.add_doc(INDEX, DOC_EN, "pending doc", true);
656        assert!(index.has_unpersisted_changes(Some(INDEX)));
657        assert!(index.has_unpersisted_changes(None));
658
659        index.take_dirty_and_deleted();
660        assert!(!index.has_unpersisted_changes(Some(INDEX)));
661        assert!(!index.has_unpersisted_changes(None));
662
663        index.remove_doc(INDEX, DOC_EN);
664        assert!(index.has_unpersisted_changes(Some(INDEX)));
665        assert!(index.has_unpersisted_changes(None));
666    }
667
668    #[test]
669    fn load_snapshot_clears_pending_flags() {
670        let mut index = InMemoryIndex::default();
671        index.add_doc(INDEX, DOC_EN, "snapshot doc", true);
672
673        let snapshot = index
674            .get_snapshot_data(INDEX)
675            .expect("snapshot should exist");
676        assert!(index.has_unpersisted_changes(Some(INDEX)));
677
678        index.load_snapshot(INDEX, snapshot);
679        assert!(
680            !index.has_unpersisted_changes(Some(INDEX)),
681            "loading a snapshot should reset pending persistence markers"
682        );
683    }
684
685    #[test]
686    fn persist_if_dirty_skips_when_clean() {
687        let mut index = InMemoryIndex::default();
688        let mut called = false;
689
690        let persisted = index
691            .persist_if_dirty(INDEX, |_snapshot| -> Result<(), ()> {
692                called = true;
693                Ok(())
694            })
695            .unwrap();
696
697        assert!(!persisted, "clean index should skip persistence");
698        assert!(!called, "callback should not run when skipped");
699    }
700
701    #[test]
702    fn persist_if_dirty_persists_and_marks_clean_on_success() {
703        let mut index = InMemoryIndex::default();
704        index.add_doc(INDEX, DOC_EN, "persist me", true);
705
706        let mut called = false;
707        let persisted = index
708            .persist_if_dirty(INDEX, |snapshot| -> Result<(), ()> {
709                called = true;
710                assert_eq!(snapshot.docs.len(), 1, "snapshot should include doc");
711                Ok(())
712            })
713            .unwrap();
714
715        assert!(persisted, "dirty index should persist");
716        assert!(called, "callback should run on persistence");
717        assert!(
718            !index.has_unpersisted_changes(Some(INDEX)),
719            "successful persist should mark index clean"
720        );
721    }
722
723    #[test]
724    fn persist_if_dirty_keeps_pending_on_error() {
725        let mut index = InMemoryIndex::default();
726        index.add_doc(INDEX, DOC_EN, "persist error", true);
727
728        let err = index
729            .persist_if_dirty(INDEX, |_snapshot| -> Result<(), &'static str> {
730                Err("boom")
731            })
732            .unwrap_err();
733        assert_eq!(err, "boom");
734        assert!(
735            index.has_unpersisted_changes(Some(INDEX)),
736            "failed persist should leave index dirty"
737        );
738    }
739
740    #[test]
741    fn fuzzy_msm_filters_insufficient_matches() {
742        let mut index = InMemoryIndex::default();
743        index.add_doc(INDEX, "doc-long", "apple banana", true);
744        index.add_doc(INDEX, "doc-short", "apple", true);
745
746        let hits = index.search_with_mode_hits(INDEX, "applr banaan", SearchMode::Fuzzy);
747        assert!(
748            hits.iter().any(|h| h.doc_id == "doc-long"),
749            "expected fuzzy msm to keep doc with both terms, got {:?}",
750            hits
751        );
752        assert!(
753            hits.iter().all(|h| h.doc_id != "doc-short"),
754            "docs below min_should_match should be filtered out: {:?}",
755            hits
756        );
757    }
758
759    #[test]
760    fn short_cjk_fuzzy_recall_uses_2gram() {
761        let mut index = InMemoryIndex::default();
762        index.add_doc(INDEX, "doc-short-cjk", "方案", true);
763
764        let hits = index.search_hits(INDEX, "方桉");
765        assert!(
766            hits.iter().any(|h| h.doc_id == "doc-short-cjk"),
767            "expected 2-gram fuzzy recall for short CJK tokens, got {:?}",
768            hits
769        );
770    }
771
772    #[test]
773    fn dictionary_load_and_fallback() {
774        let dir = tempdir().unwrap();
775        let path = dir.path().join("dict.json");
776
777        let mut entries = HashSet::new();
778        entries.insert("こんにちは".to_string());
779        let config = DictionaryConfig {
780            japanese: Some(ScriptDictionary {
781                version: Some("v1".to_string()),
782                entries,
783            }),
784            hangul: None,
785        };
786
787        std::fs::write(&path, serde_json::to_vec(&config).unwrap()).unwrap();
788        let loaded: DictionaryConfig =
789            serde_json::from_slice(&std::fs::read(&path).unwrap()).expect("should deserialize");
790
791        let mut index = InMemoryIndex::with_dictionary_config(loaded.clone());
792        index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
793
794        let hits = index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
795        assert!(
796            hits.iter().any(|h| h.doc_id == DOC_JP),
797            "expected dictionary-backed search hit, got {:?}",
798            hits
799        );
800        let mut fallback_index = InMemoryIndex::default();
801        fallback_index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
802        let fallback_hits =
803            fallback_index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
804        assert!(
805            fallback_hits.iter().any(|h| h.doc_id == DOC_JP),
806            "expected fallback tokenization to still recall doc, got {:?}",
807            fallback_hits
808        );
809    }
810
811    #[test]
812    fn id_like_tokens_match_exact() {
813        let mut index = InMemoryIndex::default();
814        let doc_id = "doc-id";
815        let id_like = "IKPeA9Zu9eo_pXlKWVFcf";
816
817        index.add_doc(INDEX, doc_id, id_like, true);
818
819        let hits = index.search_with_mode_hits(INDEX, id_like, SearchMode::Exact);
820        assert!(
821            hits.iter().any(|h| h.doc_id == doc_id),
822            "expected exact search to hit id-like token, got {:?}",
823            hits
824        );
825    }
826}