1mod base;
2mod index;
3mod ngram;
4mod pipeline;
5mod search;
6mod tokenizer;
7mod types;
8
9pub use types::{
10 DocData, InMemoryIndex, PositionEncoding, SNAPSHOT_VERSION, SearchHit, SearchMode,
11 SnapshotData, TermDomain,
12};
13
14pub use tokenizer::dictionary::{
15 DictionaryConfig, DictionaryLanguage, DictionarySegmenter, ScriptDictionary,
16 train_dictionary_config,
17};
18
19#[cfg(test)]
20mod tests {
21 use super::types::MatchedTerm;
22 use super::*;
23 use std::collections::HashSet;
24 use tempfile::tempdir;
25
/// Name of the index every test in this module writes to and queries.
const INDEX: &str = "test-index";
/// Document id used for fixtures containing Chinese text.
const DOC_CN: &str = "doc-cn";
/// Document id used for fixtures containing Latin-script text.
const DOC_EN: &str = "doc-en";
/// Document id used for fixtures containing Japanese text.
const DOC_JP: &str = "doc-jp";
61
/// Panics unless `results` holds an entry whose id equals `doc_id`.
///
/// The failure message lists every `(id, score)` pair that was returned.
fn assert_contains_doc(results: &[(String, f64)], doc_id: &str) {
    let found = results
        .iter()
        .map(|(id, _score)| id.as_str())
        .any(|id| id == doc_id);
    assert!(
        found,
        "expected results to contain doc {doc_id}, got {:?}",
        results
    );
}
69
/// The full pinyin spelling "nihao" must retrieve a document containing 你好.
#[test]
fn chinese_full_pinyin_search() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "你好世界", true);

    let results = idx.search(INDEX, "nihao");

    assert_contains_doc(&results, DOC_CN);
}
78
/// The pinyin initials "nh" must retrieve a document containing 你好.
#[test]
fn chinese_initials_search() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "你好世界", true);

    let results = idx.search(INDEX, "nh");

    assert_contains_doc(&results, DOC_CN);
}
87
/// A prefix of the pinyin initials ("nhs" out of "nhsj") must hit the document,
/// and at least one highlight span produced for the prefix hit must start at
/// the same offset as a span of the exact initials match.
#[test]
fn chinese_initials_prefix_search() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "你好世界", true);

    assert_contains_doc(&idx.search(INDEX, "nhs"), DOC_CN);

    let exact_spans = idx.get_matches(INDEX, DOC_CN, "nhsj");
    assert!(!exact_spans.is_empty());

    let prefix_hits = idx.search_hits(INDEX, "nhs");
    let hit = prefix_hits
        .iter()
        .find(|candidate| candidate.doc_id == DOC_CN)
        .expect("expected hit for prefix query");
    let prefix_spans = idx.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
    assert!(!prefix_spans.is_empty());

    let shares_start = prefix_spans
        .iter()
        .any(|(p_start, _)| exact_spans.iter().any(|(e_start, _)| e_start == p_start));
    assert!(
        shares_start,
        "prefix highlight should align to original start"
    );
}
112
/// A prefix of the full pinyin ("nih" out of "nihaoshijie") must hit the
/// document, and at least one highlight span produced for the prefix hit must
/// start at the same offset as a span of the exact full-pinyin match.
#[test]
fn chinese_full_pinyin_prefix_search() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "你好世界", true);

    assert_contains_doc(&idx.search(INDEX, "nih"), DOC_CN);

    let exact_spans = idx.get_matches(INDEX, DOC_CN, "nihaoshijie");
    assert!(!exact_spans.is_empty());

    let prefix_hits = idx.search_hits(INDEX, "nih");
    let hit = prefix_hits
        .iter()
        .find(|candidate| candidate.doc_id == DOC_CN)
        .expect("expected hit for prefix query");
    let prefix_spans = idx.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
    assert!(!prefix_spans.is_empty());

    let shares_start = prefix_spans
        .iter()
        .any(|(p_start, _)| exact_spans.iter().any(|(e_start, _)| e_start == p_start));
    assert!(
        shares_start,
        "prefix highlight should align to original start"
    );
}
137
/// A misspelled pinyin query ("nihap") must still hit through the default
/// search with matched terms reported, while an explicit `SearchMode::Fuzzy`
/// search for the same query must return nothing.
#[test]
fn pinyin_fuzzy_search() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "你好世界", true);

    let auto_hits = idx.search_hits(INDEX, "nihap");
    let has_matched_terms = auto_hits
        .iter()
        .filter(|h| h.doc_id == DOC_CN)
        .any(|h| !h.matched_terms.is_empty());
    assert!(
        has_matched_terms,
        "expected matched pinyin term in fuzzy hits: {:?}",
        auto_hits
            .iter()
            .map(|h| (&h.doc_id, &h.matched_terms))
            .collect::<Vec<_>>()
    );

    let original_only = idx.search_with_mode(INDEX, "nihap", SearchMode::Fuzzy);
    assert!(
        original_only.is_empty(),
        "expected SearchMode::Fuzzy to only search original domain, got {original_only:?}"
    );
}
160
/// A one-letter typo ("fuzze") must resolve to the original-domain term "fuzzy".
#[test]
fn english_fuzzy_search() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "fuzzy search handles typos", true);

    let hits = idx.search_hits(INDEX, "fuzze");

    let matched_original = hits
        .iter()
        .filter(|h| h.doc_id == DOC_EN)
        .flat_map(|h| h.matched_terms.iter())
        .any(|t| t.term == "fuzzy" && t.domain == TermDomain::Original);
    assert!(matched_original);
}
174
/// An upper-case hyphenated document must be found by the lower-case
/// hyphenated query in exact mode.
#[test]
fn english_query_splits_separators_and_lowercases() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "MEMORY-INDEXER", true);

    let results = idx.search_with_mode(INDEX, "memory-indexer", SearchMode::Exact);

    assert_contains_doc(&results, DOC_EN);
}
183
/// A single Cyrillic word must be found exactly inside a two-word phrase.
#[test]
fn cyrillic_term_matches_inside_phrase() {
    let russian_doc = "doc-ru";
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, russian_doc, "привет мир", true);

    let results = idx.search_with_mode(INDEX, "привет", SearchMode::Exact);

    assert_contains_doc(&results, russian_doc);
}
193
/// A single Greek word must be found exactly inside a three-word phrase.
#[test]
fn greek_term_matches_inside_phrase() {
    let greek_doc = "doc-gr";
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, greek_doc, "γειά σου κόσμε", true);

    let results = idx.search_with_mode(INDEX, "γειά", SearchMode::Exact);

    assert_contains_doc(&results, greek_doc);
}
203
/// A Cyrillic word directly followed by a comma must still match exactly.
#[test]
fn cyrillic_term_matches_with_punctuation() {
    let russian_doc = "doc-ru-punct";
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, russian_doc, "привет, привет", true);

    let results = idx.search_with_mode(INDEX, "привет", SearchMode::Exact);

    assert_contains_doc(&results, russian_doc);
}
213
/// An Armenian word directly followed by a comma must still match exactly.
#[test]
fn armenian_term_matches_with_punctuation() {
    let armenian_doc = "doc-hy-punct";
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, armenian_doc, "բարեւ, աշխարհ", true);

    let results = idx.search_with_mode(INDEX, "բարեւ", SearchMode::Exact);

    assert_contains_doc(&results, armenian_doc);
}
223
/// A transposition typo in a term that ends with a digit ("versoin2") must
/// still find "version2" in fuzzy mode.
#[test]
fn fuzzy_search_allows_alphanumeric_terms() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "version2 stable", true);

    let results = idx.search_with_mode(INDEX, "versoin2", SearchMode::Fuzzy);

    assert_contains_doc(&results, DOC_EN);
}
232
/// A hyphenated query with a typo in its first part ("memry-indexer") must
/// find the hyphenated document in fuzzy mode.
#[test]
fn fuzzy_search_handles_separated_query_terms() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "memory-indexer", true);

    let results = idx.search_with_mode(INDEX, "memry-indexer", SearchMode::Fuzzy);

    assert_contains_doc(&results, DOC_EN);
}
241
/// A two-letter query must still return the document in fuzzy mode.
#[test]
fn fuzzy_search_handles_short_terms() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "go go", true);

    let results = idx.search_with_mode(INDEX, "go", SearchMode::Fuzzy);

    assert_contains_doc(&results, DOC_EN);
}
250
/// Highlight spans for the pinyin query "nihao" must equal the spans for the
/// literal query 你好.
#[test]
fn pinyin_highlight_uses_original_positions() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "你好世界", true);

    let via_hanzi = idx.get_matches(INDEX, DOC_CN, "你好");
    let hanzi_found = !via_hanzi.is_empty();
    assert!(
        hanzi_found,
        "expected direct chinese match to have positions"
    );

    let via_pinyin = idx.get_matches(INDEX, DOC_CN, "nihao");
    assert_eq!(via_pinyin, via_hanzi);
}
265
/// For a mixed Latin/CJK document queried with "hello shi", the highlighted
/// text must include "hello"; if any highlighted slice is non-ASCII, one of
/// the slices must be exactly 世界.
#[test]
fn highlight_prefers_original_for_mixed_scripts() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "hello 世界", true);

    let hits = idx.search_hits(INDEX, "hello shi");
    let hit = hits
        .iter()
        .find(|h| h.doc_id == DOC_CN)
        .expect("expected hit for mixed script query");
    let spans = idx.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
    let content = idx.get_doc(INDEX, DOC_CN).unwrap();

    // Turn each (start, end) span into the text it covers.
    let mut slices: Vec<String> = Vec::with_capacity(spans.len());
    for (start, end) in &spans {
        slices.push(utf16_slice(&content, *start, *end));
    }
    let has_slice = |wanted: &str| slices.iter().any(|s| s == wanted);

    assert!(
        has_slice("hello"),
        "expected original spans for mixed script matches, got {:?}",
        slices
    );
    let any_non_ascii = slices.iter().any(|s| !s.is_ascii());
    if any_non_ascii {
        assert!(
            has_slice("世界"),
            "expected CJK spans for mixed script matches, got {:?}",
            slices
        );
    }
}
294
/// Highlight spans derived from the pinyin prefix hit "nih" must equal the
/// spans of the literal query 你好.
#[test]
fn pinyin_prefix_highlight_uses_original_spans() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "你好世界", true);

    let hits = idx.search_hits(INDEX, "nih");
    let hit = hits
        .iter()
        .find(|h| h.doc_id == DOC_CN)
        .expect("expected prefix pinyin hit");

    let from_prefix = idx.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
    let from_hanzi = idx.get_matches(INDEX, DOC_CN, "你好");
    assert_eq!(
        from_prefix, from_hanzi,
        "prefix highlight should map back to original spans"
    );
}
311
/// With UTF-16 position encoding, every highlight produced for the pinyin
/// query "neicun" must cover exactly 内存 in a longer Chinese sentence.
#[test]
fn pinyin_highlight_handles_trailing_ascii() {
    let mut idx = InMemoryIndex::with_position_encoding(PositionEncoding::Utf16);
    let sentence = "美光将在全球内存供应短缺之际退出消费级内存业务";
    idx.add_doc(INDEX, DOC_CN, sentence, true);

    let hits = idx.search_hits(INDEX, "neicun");
    let Some(hit) = hits.iter().find(|h| h.doc_id == DOC_CN) else {
        panic!("expected hit for neicun, got {:?}", hits);
    };

    let spans = idx.get_matches_for_matched_terms(INDEX, DOC_CN, &hit.matched_terms);
    assert!(
        !spans.is_empty(),
        "expected highlight spans for pinyin match, got none"
    );

    let content = idx.get_doc(INDEX, DOC_CN).unwrap();
    let mut slices: Vec<String> = Vec::with_capacity(spans.len());
    for (start, end) in &spans {
        slices.push(utf16_slice(&content, *start, *end));
    }
    let off_target = slices.iter().any(|s| s != "内存");
    assert!(
        !off_target,
        "expected highlights to stay on original term, got {:?}",
        slices
    );
}
343
/// Returns the part of `content` covered by the half-open UTF-16 code-unit
/// range `start..end`.
///
/// Offsets are counted in UTF-16 code units, so a character outside the Basic
/// Multilingual Plane counts as two units.
///
/// Edge cases:
/// * an empty range (`start == end`) yields an empty string;
/// * an offset past the end of `content` is clamped to the end;
/// * an offset that falls inside a surrogate pair is rounded up to the next
///   character boundary;
/// * `start > end` yields an empty string instead of panicking.
fn utf16_slice(content: &str, start: u32, end: u32) -> String {
    // Maps a UTF-16 offset to the byte index of the first character that
    // begins at or after it; falls back to the end of the string.
    let byte_offset = |target: u32| -> usize {
        let mut units = 0u32;
        for (idx, ch) in content.char_indices() {
            if units >= target {
                return idx;
            }
            // `len_utf16` is always 1 or 2, so the cast cannot truncate.
            units += ch.len_utf16() as u32;
        }
        content.len()
    };

    let start_byte = byte_offset(start);
    // Never let the end precede the start, which would make slicing panic.
    let end_byte = byte_offset(end).max(start_byte);
    content[start_byte..end_byte].to_string()
}
360
/// When a Latin document literally contains "nihao", exact and default
/// searches must return only that document, while pinyin mode must still reach
/// the Chinese document.
#[test]
fn exact_search_prefers_original_terms() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "nihao greeting", true);
    idx.add_doc(INDEX, DOC_CN, "你好世界", true);

    let exact_hits = idx.search_with_mode(INDEX, "nihao", SearchMode::Exact);
    assert_contains_doc(&exact_hits, DOC_EN);
    let exact_has_other = exact_hits.iter().any(|(id, _)| id != DOC_EN);
    assert!(
        !exact_has_other,
        "expected exact search to ignore pinyin matches, got {:?}",
        exact_hits
    );

    let auto_hits = idx.search(INDEX, "nihao");
    assert_contains_doc(&auto_hits, DOC_EN);
    let auto_has_chinese = auto_hits.iter().any(|(id, _)| id == DOC_CN);
    assert!(
        !auto_has_chinese,
        "auto search should stop at exact matches"
    );

    let pinyin_hits = idx.search_with_mode(INDEX, "nihao", SearchMode::Pinyin);
    assert_contains_doc(&pinyin_hits, DOC_CN);
}
385
/// A two-character kana query must hit the Japanese document and yield
/// highlight offsets.
#[test]
fn japanese_ngram_search() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_JP, "こんにちは世界", true);

    let results = idx.search(INDEX, "こん");
    assert_contains_doc(&results, DOC_JP);

    let spans = idx.get_matches(INDEX, DOC_JP, "こん");
    let has_spans = !spans.is_empty();
    assert!(
        has_spans,
        "expected offsets for japanese ngram matches"
    );
}
400
/// Kanji that appear next to kana must not be reachable via pinyin: the pinyin
/// query "dongjing" returns nothing for 東京へようこそ.
#[test]
fn kanji_adjacent_to_kana_skips_pinyin() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_JP, "東京へようこそ", true);

    let pinyin_hits = idx.search_with_mode(INDEX, "dongjing", SearchMode::Pinyin);

    assert!(
        pinyin_hits.is_empty(),
        "kanji near kana should not derive pinyin, got {pinyin_hits:?}"
    );
}
413
/// For a three-term exact query, documents matching two or three terms are
/// kept, the single-term document is dropped, and the three-term document
/// outscores the two-term one.
#[test]
fn exact_search_applies_minimum_should_match() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, "doc-2-terms", "apple banana", true);
    idx.add_doc(INDEX, "doc-3-terms", "apple banana cherry", true);
    idx.add_doc(INDEX, "doc-1-term", "apple", true);

    let hits = idx.search_with_mode(INDEX, "apple banana cherry", SearchMode::Exact);

    assert_contains_doc(&hits, "doc-2-terms");
    assert_contains_doc(&hits, "doc-3-terms");
    let single_term_present = hits.iter().any(|(id, _)| id == "doc-1-term");
    assert!(
        !single_term_present,
        "docs below minimum_should_match should be filtered out"
    );

    // Looks up the score of a document that is known to be in `hits`.
    let score_of = |wanted: &str| -> f64 {
        hits.iter()
            .find(|(id, _)| id == wanted)
            .map(|(_, score)| *score)
            .unwrap()
    };
    let score_two = score_of("doc-2-terms");
    let score_three = score_of("doc-3-terms");
    assert!(
        score_three > score_two,
        "more matched terms should score higher: {score_three} vs {score_two}"
    );
}
447
/// Both readings of the polyphonic 重 ("zhongqing" and "chongqing") must hit
/// 重庆火锅 in pinyin mode, and the matched terms must mention one of them.
#[test]
fn pinyin_polyphonic_variants_for_short_tokens() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "重庆火锅", true);

    let hits_zhong = idx.search_with_mode_hits(INDEX, "zhongqing", SearchMode::Pinyin);
    let zhong_found = hits_zhong.iter().any(|h| h.doc_id == DOC_CN);
    assert!(zhong_found, "expected zhongqing variant to hit");

    let hits_chong = idx.search_with_mode_hits(INDEX, "chongqing", SearchMode::Pinyin);
    let chong_found = hits_chong.iter().any(|h| h.doc_id == DOC_CN);
    assert!(chong_found, "expected chongqing variant to hit");

    let matched_terms: Vec<MatchedTerm> =
        match hits_zhong.into_iter().find(|h| h.doc_id == DOC_CN) {
            Some(hit) => hit.matched_terms,
            None => Vec::new(),
        };
    let mentions_variant = matched_terms
        .iter()
        .any(|t| t.term.contains("zhongqing") || t.term.contains("chongqing"));
    assert!(
        mentions_variant,
        "expected polyphonic pinyin variants in matched_terms, got {:?}",
        matched_terms
    );
}
478
/// A misspelled query must report the lower-cased original term
/// "memoryindexer" as matched, and those matched terms must produce highlight
/// spans.
#[test]
fn get_matches_for_terms_uses_matched_terms() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "memoryIndexer", true);

    let hits = idx.search_hits(INDEX, "memryindexer");
    let hit = hits
        .iter()
        .find(|h| h.doc_id == DOC_EN)
        .expect("expected hit for doc");

    let has_original_term = hit
        .matched_terms
        .iter()
        .any(|t| t.term == "memoryindexer" && t.domain == TermDomain::Original);
    assert!(
        has_original_term,
        "expected matched term memoryIndexer, got {:?}",
        hit.matched_terms
    );

    let spans = idx.get_matches_for_matched_terms(INDEX, DOC_EN, &hit.matched_terms);
    assert!(!spans.is_empty(), "expected matches from matched_terms");
}
499
/// A pinyin query typed with full-width Latin letters must hit the Chinese
/// document and report a `TermDomain::PinyinFull` matched term.
#[test]
fn fullwidth_pinyin_query_hits() {
    // "NIHAO" spelled with full-width letters (U+FF2E U+FF29 U+FF28 U+FF21
    // U+FF2F). Written as escapes so the query cannot be silently folded to
    // plain ASCII by an editor or normalizing tool, which would leave the
    // full-width path untested.
    const FULLWIDTH_NIHAO: &str = "\u{FF2E}\u{FF29}\u{FF28}\u{FF21}\u{FF2F}";

    let mut index = InMemoryIndex::default();
    index.add_doc(INDEX, DOC_CN, "你好世界", true);

    let hits = index.search_hits(INDEX, FULLWIDTH_NIHAO);
    let cn_hit = hits.iter().find(|h| h.doc_id == DOC_CN);
    assert!(
        cn_hit.is_some(),
        "expected full-width pinyin query to hit, got {:?}",
        hits.iter()
            .map(|h| (&h.doc_id, &h.matched_terms))
            .collect::<Vec<_>>()
    );

    let matched = cn_hit.and_then(|h| {
        h.matched_terms
            .iter()
            .find(|t| t.domain == TermDomain::PinyinFull)
    });
    assert!(
        matched.is_some(),
        "expected matched pinyin full term, got {:?}",
        cn_hit.map(|h| h.matched_terms.clone())
    );
}
527
/// A pinyin query one letter short ("niha") must hit 你好 and report a
/// `TermDomain::PinyinFull` matched term.
#[test]
fn short_pinyin_fuzzy_hits() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "你好", true);

    let hits = idx.search_hits(INDEX, "niha");
    let cn_hit = hits.iter().find(|h| h.doc_id == DOC_CN);
    assert!(
        cn_hit.is_some(),
        "expected fuzzy pinyin hit for short query, got {:?}",
        hits.iter()
            .map(|h| (&h.doc_id, &h.matched_terms))
            .collect::<Vec<_>>()
    );

    let has_pinyin_full = cn_hit.map_or(false, |h| {
        h.matched_terms
            .iter()
            .any(|t| t.domain == TermDomain::PinyinFull)
    });
    assert!(
        has_pinyin_full,
        "expected matched pinyin term, got {:?}",
        cn_hit.map(|h| h.matched_terms.clone())
    );
}
555
/// A Chinese query with one wrong character (北景大学) must still hit 北京大学
/// through the default search.
#[test]
fn non_ascii_auto_fuzzy_fallback() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "北京大学", true);

    let hits = idx.search_hits(INDEX, "北景大学");

    let found = hits.iter().any(|h| h.doc_id == DOC_CN);
    assert!(
        found,
        "expected non-ascii fuzzy fallback to hit, got {:?}",
        hits.iter()
            .map(|h| (&h.doc_id, &h.matched_terms))
            .collect::<Vec<_>>()
    );
}
571
/// A mixed Latin/CJK query must hit the document and report both the
/// original-domain term "hello" and the term 世界 as matched.
#[test]
fn mixed_script_query_hits_all_tokens() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "hello 世界", true);

    let hits = idx.search_hits(INDEX, "hello 世界");
    let cn_hit = hits.iter().find(|h| h.doc_id == DOC_CN);
    assert!(
        cn_hit.is_some(),
        "expected mixed-script query to hit doc, got {:?}",
        hits.iter()
            .map(|h| (&h.doc_id, &h.matched_terms))
            .collect::<Vec<_>>()
    );

    let matched = match cn_hit {
        Some(hit) => hit.matched_terms.clone(),
        None => Vec::new(),
    };
    let has_hello = matched
        .iter()
        .any(|t| t.term == "hello" && t.domain == TermDomain::Original);
    assert!(
        has_hello,
        "expected matched original term hello, got {:?}",
        matched
    );
    let has_cjk = matched.iter().any(|t| t.term == "世界");
    assert!(
        has_cjk,
        "expected matched CJK term 世界, got {:?}",
        matched
    );
}
603
/// A two-character Chinese query with one wrong character (明搏) must still
/// recall the document 明博.
#[test]
fn chinese_oov_fuzzy_recall() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_CN, "明博", true);

    let hits = idx.search_hits(INDEX, "明搏");

    let found = hits.iter().any(|h| h.doc_id == DOC_CN);
    assert!(
        found,
        "expected OOV chinese fuzzy to hit, got {:?}",
        hits.iter()
            .map(|h| (&h.doc_id, &h.matched_terms))
            .collect::<Vec<_>>()
    );
}
619
/// Loading a snapshot into a fresh index must keep fuzzy pinyin search working
/// and preserve both `total_len` and the original-domain total length.
#[test]
fn load_snapshot_restores_domains_and_lengths() {
    let mut source = InMemoryIndex::default();
    source.add_doc(INDEX, DOC_CN, "你好世界", true);

    let snapshot = source
        .get_snapshot_data(INDEX)
        .expect("snapshot should exist");
    // Capture the expected figures before the snapshot is moved into the
    // restored index.
    let expected_total_len = snapshot.total_len;
    let expected_domain_len = snapshot.domain_total_len.get(TermDomain::Original);

    let mut restored = InMemoryIndex::default();
    restored.load_snapshot(INDEX, snapshot);

    let hits = restored.search_hits(INDEX, "nihap");
    let served = hits.iter().any(|hit| hit.doc_id == DOC_CN);
    assert!(
        served,
        "expected restored index to serve pinyin fuzzy hits"
    );

    let state = restored
        .indexes
        .get(INDEX)
        .expect("restored index state should exist");
    assert_eq!(state.total_len, expected_total_len);
    let restored_domain_len = state.domain_total_len.get(TermDomain::Original);
    assert_eq!(restored_domain_len, expected_domain_len);
}
649
/// `has_unpersisted_changes` must report pending work after an add, report
/// none after `take_dirty_and_deleted`, and report pending work again after a
/// removal, both for the named index and for the `None` scope.
#[test]
fn has_unpersisted_changes_tracks_dirty_and_deleted() {
    let scopes = [Some(INDEX), None];

    let mut idx = InMemoryIndex::default();
    assert!(!idx.has_unpersisted_changes(None));

    idx.add_doc(INDEX, DOC_EN, "pending doc", true);
    for scope in scopes {
        assert!(idx.has_unpersisted_changes(scope));
    }

    idx.take_dirty_and_deleted();
    for scope in scopes {
        assert!(!idx.has_unpersisted_changes(scope));
    }

    idx.remove_doc(INDEX, DOC_EN);
    for scope in scopes {
        assert!(idx.has_unpersisted_changes(scope));
    }
}
667
/// Taking a snapshot leaves the index dirty; loading that snapshot back must
/// clear the pending-changes state.
#[test]
fn load_snapshot_clears_pending_flags() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "snapshot doc", true);

    let snapshot = idx
        .get_snapshot_data(INDEX)
        .expect("snapshot should exist");
    assert!(idx.has_unpersisted_changes(Some(INDEX)));

    idx.load_snapshot(INDEX, snapshot);
    let still_pending = idx.has_unpersisted_changes(Some(INDEX));
    assert!(
        !still_pending,
        "loading a snapshot should reset pending persistence markers"
    );
}
684
/// On an index with no pending changes, `persist_if_dirty` must return
/// `Ok(false)` without invoking the callback.
#[test]
fn persist_if_dirty_skips_when_clean() {
    let mut idx = InMemoryIndex::default();
    let mut called = false;

    let outcome = idx.persist_if_dirty(INDEX, |_snapshot| -> Result<(), ()> {
        called = true;
        Ok(())
    });
    let persisted = outcome.unwrap();

    assert!(!persisted, "clean index should skip persistence");
    assert!(!called, "callback should not run when skipped");
}
700
/// On a dirty index, `persist_if_dirty` must invoke the callback with a
/// snapshot holding the document, return `Ok(true)`, and leave the index clean.
#[test]
fn persist_if_dirty_persists_and_marks_clean_on_success() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "persist me", true);

    let mut called = false;
    let outcome = idx.persist_if_dirty(INDEX, |snapshot| -> Result<(), ()> {
        called = true;
        assert_eq!(snapshot.docs.len(), 1, "snapshot should include doc");
        Ok(())
    });
    let persisted = outcome.unwrap();

    assert!(persisted, "dirty index should persist");
    assert!(called, "callback should run on persistence");
    let still_pending = idx.has_unpersisted_changes(Some(INDEX));
    assert!(
        !still_pending,
        "successful persist should mark index clean"
    );
}
722
/// When the callback fails, `persist_if_dirty` must pass the error through and
/// leave the index dirty.
#[test]
fn persist_if_dirty_keeps_pending_on_error() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_EN, "persist error", true);

    let outcome = idx.persist_if_dirty(INDEX, |_snapshot| -> Result<(), &'static str> {
        Err("boom")
    });
    let err = outcome.unwrap_err();

    assert_eq!(err, "boom");
    let still_pending = idx.has_unpersisted_changes(Some(INDEX));
    assert!(
        still_pending,
        "failed persist should leave index dirty"
    );
}
739
/// For a two-term fuzzy query with typos in both terms, the document holding
/// both terms is kept and the document holding only one is filtered out.
#[test]
fn fuzzy_msm_filters_insufficient_matches() {
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, "doc-long", "apple banana", true);
    idx.add_doc(INDEX, "doc-short", "apple", true);

    let hits = idx.search_with_mode_hits(INDEX, "applr banaan", SearchMode::Fuzzy);

    let long_kept = hits.iter().any(|h| h.doc_id == "doc-long");
    assert!(
        long_kept,
        "expected fuzzy msm to keep doc with both terms, got {hits:?}"
    );
    let short_kept = hits.iter().any(|h| h.doc_id == "doc-short");
    assert!(
        !short_kept,
        "docs below min_should_match should be filtered out: {hits:?}"
    );
}
758
/// A two-character CJK query with one wrong character (方桉) must still recall
/// the two-character document 方案.
#[test]
fn short_cjk_fuzzy_recall_uses_2gram() {
    let short_doc = "doc-short-cjk";
    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, short_doc, "方案", true);

    let hits = idx.search_hits(INDEX, "方桉");

    let recalled = hits.iter().any(|h| h.doc_id == short_doc);
    assert!(
        recalled,
        "expected 2-gram fuzzy recall for short CJK tokens, got {hits:?}"
    );
}
771
/// Round-trips a `DictionaryConfig` through a JSON file, then checks that an
/// exact search for こんにちは hits both with the loaded dictionary and with
/// the default index that has no dictionary configured.
#[test]
fn dictionary_load_and_fallback() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("dict.json");

    let mut entries = HashSet::new();
    entries.insert("こんにちは".to_string());
    let config = DictionaryConfig {
        japanese: Some(ScriptDictionary {
            version: Some("v1".to_string()),
            entries,
        }),
        hangul: None,
    };

    std::fs::write(&path, serde_json::to_vec(&config).unwrap()).unwrap();
    let loaded: DictionaryConfig =
        serde_json::from_slice(&std::fs::read(&path).unwrap()).expect("should deserialize");

    // `loaded` is not needed afterwards, so it is moved instead of cloned.
    let mut index = InMemoryIndex::with_dictionary_config(loaded);
    index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);

    let hits = index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
    assert!(
        hits.iter().any(|h| h.doc_id == DOC_JP),
        "expected dictionary-backed search hit, got {:?}",
        hits
    );

    // Same document and query against an index built without a dictionary.
    let mut fallback_index = InMemoryIndex::default();
    fallback_index.add_doc(INDEX, DOC_JP, "こんにちは世界", true);
    let fallback_hits =
        fallback_index.search_with_mode_hits(INDEX, "こんにちは", SearchMode::Exact);
    assert!(
        fallback_hits.iter().any(|h| h.doc_id == DOC_JP),
        "expected fallback tokenization to still recall doc, got {:?}",
        fallback_hits
    );
}
810
/// An opaque identifier mixing case, digits and an underscore must be found by
/// an exact search for the identical string.
#[test]
fn id_like_tokens_match_exact() {
    let doc_id = "doc-id";
    let id_like = "IKPeA9Zu9eo_pXlKWVFcf";

    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, doc_id, id_like, true);

    let hits = idx.search_with_mode_hits(INDEX, id_like, SearchMode::Exact);

    let found = hits.iter().any(|h| h.doc_id == doc_id);
    assert!(
        found,
        "expected exact search to hit id-like token, got {hits:?}"
    );
}
826}