1use std::collections::{BTreeMap, BTreeSet, HashMap};
36use std::path::{Path, PathBuf};
37
38use fst::automaton::{Levenshtein, Str};
39use fst::{Automaton, IntoStreamer, Map, MapBuilder, Streamer};
40use regex_automata::Anchored;
43use regex_automata::dfa::Automaton as _;
44use regex_automata::dfa::{StartKind, dense};
45use regex_automata::util::primitives::StateID;
46
47const MANIFEST_VERSION: u64 = 1;
49
50const MAX_FUZZY: u32 = 2;
53
54pub fn tokenize(text: &str) -> Vec<String> {
61 let mut out = Vec::new();
62 let mut cur = String::new();
63 for ch in text.chars() {
64 if ch.is_alphanumeric() {
65 cur.extend(ch.to_lowercase());
66 } else if !cur.is_empty() {
67 out.push(std::mem::take(&mut cur));
68 }
69 }
70 if !cur.is_empty() {
71 out.push(cur);
72 }
73 out
74}
75
76fn put_uvarint(buf: &mut Vec<u8>, mut val: u64) {
80 loop {
81 let mut byte = (val & 0x7f) as u8;
82 val >>= 7;
83 if val != 0 {
84 byte |= 0x80;
85 }
86 buf.push(byte);
87 if val == 0 {
88 break;
89 }
90 }
91}
92
93fn get_uvarint(buf: &[u8], pos: &mut usize) -> Option<u64> {
95 let mut val = 0u64;
96 let mut shift = 0u32;
97 loop {
98 let byte = *buf.get(*pos)?;
99 *pos += 1;
100 val |= u64::from(byte & 0x7f) << shift;
101 if byte & 0x80 == 0 {
102 return Some(val);
103 }
104 shift += 7;
105 if shift >= 64 {
106 return None;
107 }
108 }
109}
110
111fn encode_postings(buf: &mut Vec<u8>, postings: &[(u64, u32)]) -> u64 {
114 let offset = buf.len() as u64;
115 put_uvarint(buf, postings.len() as u64);
116 for &(doc, tf) in postings {
117 put_uvarint(buf, doc);
118 put_uvarint(buf, u64::from(tf));
119 }
120 offset
121}
122
123fn decode_postings(blob: &[u8], offset: u64) -> Vec<(u64, u32)> {
125 let mut pos = offset as usize;
126 let Some(n) = get_uvarint(blob, &mut pos) else {
127 return Vec::new();
128 };
129 let mut out = Vec::with_capacity(n as usize);
130 for _ in 0..n {
131 let (Some(doc), Some(tf)) = (get_uvarint(blob, &mut pos), get_uvarint(blob, &mut pos))
132 else {
133 break;
134 };
135 out.push((doc, tf as u32));
136 }
137 out
138}
139
140#[derive(Debug, Clone, PartialEq, Eq)]
145pub struct FileStat {
146 pub key: String,
148 pub path: PathBuf,
150 pub mtime_ns: u64,
152 pub size: u64,
154}
155
156#[derive(Debug, Clone, Default, PartialEq, Eq)]
159pub struct DocSource {
160 pub title: String,
162 pub type_: String,
164 pub tags: Vec<String>,
166 pub text: String,
168}
169
170#[derive(Debug, Default, Clone, PartialEq, Eq)]
172pub struct UpdateReport {
173 pub added: usize,
174 pub changed: usize,
175 pub removed: usize,
176 pub wrote_segment: bool,
178}
179
180impl UpdateReport {
181 pub fn is_empty(&self) -> bool {
183 self.added == 0 && self.changed == 0 && self.removed == 0
184 }
185}
186
187#[derive(Debug, Clone, PartialEq, Eq)]
191struct DocMeta {
192 key: String,
193 title: String,
194 type_: String,
195 tags: Vec<String>,
196 len: u32,
198}
199
200#[derive(Debug, Clone, PartialEq, Eq)]
202struct FileRec {
203 doc: u64,
204 mtime_ns: u64,
205 size: u64,
206}
207
208#[derive(Debug, Clone, Default)]
210struct Manifest {
211 next_doc: u64,
212 segments: Vec<u32>,
213 files: BTreeMap<String, FileRec>,
214 docs: BTreeMap<u64, DocMeta>,
215 deleted: BTreeSet<u64>,
216}
217
218impl Manifest {
219 fn to_json(&self) -> serde_json::Value {
220 let files: serde_json::Map<String, serde_json::Value> = self
221 .files
222 .iter()
223 .map(|(k, r)| {
224 (
225 k.clone(),
226 serde_json::json!({ "doc": r.doc, "mtime_ns": r.mtime_ns, "size": r.size }),
227 )
228 })
229 .collect();
230 let docs: serde_json::Map<String, serde_json::Value> = self
231 .docs
232 .iter()
233 .map(|(id, m)| {
234 (
235 id.to_string(),
236 serde_json::json!({
237 "key": m.key,
238 "title": m.title,
239 "type": m.type_,
240 "tags": m.tags,
241 "len": m.len,
242 }),
243 )
244 })
245 .collect();
246 serde_json::json!({
247 "version": MANIFEST_VERSION,
248 "next_doc": self.next_doc,
249 "segments": self.segments,
250 "files": files,
251 "docs": docs,
252 "deleted": self.deleted.iter().collect::<Vec<_>>(),
253 })
254 }
255
256 fn from_json(v: &serde_json::Value) -> Result<Manifest, String> {
257 let obj = v.as_object().ok_or("manifest is not an object")?;
258 let mut m = Manifest {
259 next_doc: obj.get("next_doc").and_then(|x| x.as_u64()).unwrap_or(0),
260 ..Manifest::default()
261 };
262 if let Some(arr) = obj.get("segments").and_then(|x| x.as_array()) {
263 m.segments = arr
264 .iter()
265 .filter_map(|x| x.as_u64().map(|n| n as u32))
266 .collect();
267 }
268 if let Some(files) = obj.get("files").and_then(|x| x.as_object()) {
269 for (k, r) in files {
270 let doc = r.get("doc").and_then(|x| x.as_u64()).unwrap_or(0);
271 let mtime_ns = r.get("mtime_ns").and_then(|x| x.as_u64()).unwrap_or(0);
272 let size = r.get("size").and_then(|x| x.as_u64()).unwrap_or(0);
273 m.files.insert(
274 k.clone(),
275 FileRec {
276 doc,
277 mtime_ns,
278 size,
279 },
280 );
281 }
282 }
283 if let Some(docs) = obj.get("docs").and_then(|x| x.as_object()) {
284 for (id, d) in docs {
285 let Ok(id) = id.parse::<u64>() else { continue };
286 let tags = d
287 .get("tags")
288 .and_then(|x| x.as_array())
289 .map(|a| {
290 a.iter()
291 .filter_map(|t| t.as_str().map(String::from))
292 .collect()
293 })
294 .unwrap_or_default();
295 m.docs.insert(
296 id,
297 DocMeta {
298 key: d
299 .get("key")
300 .and_then(|x| x.as_str())
301 .unwrap_or("")
302 .to_string(),
303 title: d
304 .get("title")
305 .and_then(|x| x.as_str())
306 .unwrap_or("")
307 .to_string(),
308 type_: d
309 .get("type")
310 .and_then(|x| x.as_str())
311 .unwrap_or("")
312 .to_string(),
313 tags,
314 len: d.get("len").and_then(|x| x.as_u64()).unwrap_or(0) as u32,
315 },
316 );
317 }
318 }
319 if let Some(arr) = obj.get("deleted").and_then(|x| x.as_array()) {
320 m.deleted = arr.iter().filter_map(|x| x.as_u64()).collect();
321 }
322 Ok(m)
323 }
324}
325
326#[derive(Debug, Clone, PartialEq, Eq)]
330pub enum QueryTerm {
331 Exact(String),
333 Prefix(String),
335 Fuzzy(String, u32),
337 Regex(String),
339}
340
341pub fn parse_query(query: &str) -> Vec<QueryTerm> {
349 let mut out = Vec::new();
350 for tok in query.split_whitespace() {
351 if tok.len() >= 2 && tok.starts_with('/') && tok.ends_with('/') {
353 let inner = &tok[1..tok.len() - 1];
354 if !inner.is_empty() {
355 out.push(QueryTerm::Regex(inner.to_string()));
356 }
357 continue;
358 }
359 enum Op {
362 Exact,
363 Prefix,
364 Fuzzy(u32),
365 }
366 let (core, op) = if let Some(tilde) = tok.rfind('~') {
367 let dist = tok[tilde + 1..]
368 .parse::<u32>()
369 .unwrap_or(1)
370 .clamp(1, MAX_FUZZY);
371 (&tok[..tilde], Op::Fuzzy(dist))
372 } else if let Some(head) = tok.strip_suffix('*') {
373 (head, Op::Prefix)
374 } else {
375 (tok, Op::Exact)
376 };
377 let mut toks = tokenize(core);
378 if let Some(last) = toks.pop() {
379 for t in toks {
380 out.push(QueryTerm::Exact(t));
381 }
382 out.push(match op {
383 Op::Exact => QueryTerm::Exact(last),
384 Op::Prefix => QueryTerm::Prefix(last),
385 Op::Fuzzy(d) => QueryTerm::Fuzzy(last, d),
386 });
387 }
388 }
389 out
390}
391
392struct Segment {
396 map: Map<Vec<u8>>,
397 pos: Vec<u8>,
398}
399
400fn collect_matches<A: Automaton>(
404 segments: &[Segment],
405 aut: &A,
406 deleted: &BTreeSet<u64>,
407 merged: &mut HashMap<String, Vec<(u64, u32)>>,
408) {
409 for seg in segments {
410 let mut stream = seg.map.search(aut).into_stream();
411 while let Some((key, off)) = stream.next() {
412 let live: Vec<(u64, u32)> = decode_postings(&seg.pos, off)
413 .into_iter()
414 .filter(|(doc, _)| !deleted.contains(doc))
415 .collect();
416 if !live.is_empty()
417 && let Ok(term) = std::str::from_utf8(key)
418 {
419 merged.entry(term.to_string()).or_default().extend(live);
420 }
421 }
422 }
423}
424
425struct DfaAutomaton {
430 dfa: dense::DFA<Vec<u32>>,
431 start: StateID,
432}
433
434impl DfaAutomaton {
435 fn new(pattern: &str) -> Result<DfaAutomaton, String> {
436 let dfa = dense::Builder::new()
437 .configure(dense::Config::new().start_kind(StartKind::Anchored))
438 .build(pattern)
439 .map_err(|e| format!("invalid regex: {e}"))?;
440 let start = dfa
441 .universal_start_state(Anchored::Yes)
442 .ok_or("regex start depends on look-around, unsupported here")?;
443 Ok(DfaAutomaton { dfa, start })
444 }
445}
446
447impl Automaton for DfaAutomaton {
448 type State = StateID;
449
450 fn start(&self) -> StateID {
451 self.start
452 }
453
454 fn is_match(&self, state: &StateID) -> bool {
455 self.dfa.is_match_state(self.dfa.next_eoi_state(*state))
457 }
458
459 fn can_match(&self, state: &StateID) -> bool {
460 !self.dfa.is_dead_state(*state)
462 }
463
464 fn accept(&self, state: &StateID, byte: u8) -> StateID {
465 self.dfa.next_state(*state, byte)
466 }
467}
468
469#[derive(Debug, Clone, PartialEq)]
473pub struct SearchHit {
474 pub key: String,
476 pub title: String,
477 pub type_: String,
478 pub tags: Vec<String>,
479 pub score: f32,
480}
481
482pub struct Index {
484 dir: PathBuf,
485 manifest: Manifest,
486}
487
488impl Index {
489 pub fn open(dir: &Path) -> Result<Index, String> {
493 let manifest_path = dir.join("manifest.json");
494 let manifest = match std::fs::read_to_string(&manifest_path) {
495 Ok(text) => {
496 let v: serde_json::Value = serde_json::from_str(&text)
497 .map_err(|e| format!("{}: {e}", manifest_path.display()))?;
498 Manifest::from_json(&v)?
499 }
500 Err(_) => Manifest::default(),
501 };
502 Ok(Index {
503 dir: dir.to_path_buf(),
504 manifest,
505 })
506 }
507
508 pub fn doc_count(&self) -> usize {
510 self.manifest.docs.len()
511 }
512
513 pub fn segment_count(&self) -> usize {
515 self.manifest.segments.len()
516 }
517
518 pub fn tombstone_count(&self) -> usize {
520 self.manifest.deleted.len()
521 }
522
523 pub fn pending(&self, current: &[FileStat]) -> (usize, usize, usize) {
527 let present: BTreeSet<&str> = current.iter().map(|f| f.key.as_str()).collect();
528 let removed = self
529 .manifest
530 .files
531 .keys()
532 .filter(|k| !present.contains(k.as_str()))
533 .count();
534 let (mut added, mut changed) = (0, 0);
535 for f in current {
536 match self.manifest.files.get(&f.key) {
537 None => added += 1,
538 Some(r) if r.mtime_ns != f.mtime_ns || r.size != f.size => changed += 1,
539 _ => {}
540 }
541 }
542 (added, changed, removed)
543 }
544
545 fn seg_paths(&self, num: u32) -> (PathBuf, PathBuf) {
546 let base = format!("seg-{num:05}");
547 (
548 self.dir.join(format!("{base}.fst")),
549 self.dir.join(format!("{base}.pos")),
550 )
551 }
552
553 fn load_segment(&self, num: u32) -> Result<Segment, String> {
554 let (fst_path, pos_path) = self.seg_paths(num);
555 let fst_bytes =
556 std::fs::read(&fst_path).map_err(|e| format!("{}: {e}", fst_path.display()))?;
557 let pos = std::fs::read(&pos_path).map_err(|e| format!("{}: {e}", pos_path.display()))?;
558 let map = Map::new(fst_bytes).map_err(|e| format!("{}: {e}", fst_path.display()))?;
559 Ok(Segment { map, pos })
560 }
561
562 pub fn update<F>(&mut self, current: &[FileStat], load: F) -> Result<UpdateReport, String>
567 where
568 F: Fn(&FileStat) -> Result<DocSource, String>,
569 {
570 let mut report = UpdateReport::default();
571 let present: BTreeSet<&str> = current.iter().map(|f| f.key.as_str()).collect();
572
573 let removed_keys: Vec<String> = self
575 .manifest
576 .files
577 .keys()
578 .filter(|k| !present.contains(k.as_str()))
579 .cloned()
580 .collect();
581 for key in removed_keys {
582 if let Some(rec) = self.manifest.files.remove(&key) {
583 self.manifest.docs.remove(&rec.doc);
584 self.manifest.deleted.insert(rec.doc);
585 report.removed += 1;
586 }
587 }
588
589 let mut postings: BTreeMap<String, BTreeMap<u64, u32>> = BTreeMap::new();
592 for f in current {
593 let unchanged = self
594 .manifest
595 .files
596 .get(&f.key)
597 .is_some_and(|r| r.mtime_ns == f.mtime_ns && r.size == f.size);
598 if unchanged {
599 continue;
600 }
601 let is_change = self.manifest.files.contains_key(&f.key);
602 if let Some(old) = self.manifest.files.get(&f.key) {
604 self.manifest.docs.remove(&old.doc);
605 self.manifest.deleted.insert(old.doc);
606 }
607 let src = load(f)?;
608 let doc = self.manifest.next_doc;
609 self.manifest.next_doc += 1;
610 let terms = tokenize(&format!(
611 "{} {} {} {}",
612 src.title,
613 src.type_,
614 src.tags.join(" "),
615 src.text
616 ));
617 let len = terms.len() as u32;
618 for term in terms {
619 *postings.entry(term).or_default().entry(doc).or_insert(0) += 1;
620 }
621 self.manifest.docs.insert(
622 doc,
623 DocMeta {
624 key: f.key.clone(),
625 title: src.title,
626 type_: src.type_,
627 tags: src.tags,
628 len,
629 },
630 );
631 self.manifest.files.insert(
632 f.key.clone(),
633 FileRec {
634 doc,
635 mtime_ns: f.mtime_ns,
636 size: f.size,
637 },
638 );
639 if is_change {
640 report.changed += 1;
641 } else {
642 report.added += 1;
643 }
644 }
645
646 if !postings.is_empty() {
647 let num = self
648 .manifest
649 .segments
650 .iter()
651 .copied()
652 .max()
653 .map(|n| n + 1)
654 .unwrap_or(0);
655 self.write_segment(num, &postings)?;
656 self.manifest.segments.push(num);
657 report.wrote_segment = true;
658 }
659 Ok(report)
660 }
661
662 fn write_segment(
664 &self,
665 num: u32,
666 postings: &BTreeMap<String, BTreeMap<u64, u32>>,
667 ) -> Result<(), String> {
668 std::fs::create_dir_all(&self.dir).map_err(|e| format!("{}: {e}", self.dir.display()))?;
669 let (fst_path, pos_path) = self.seg_paths(num);
670 let mut pos_blob = Vec::new();
671 let wtr = std::io::BufWriter::new(
672 std::fs::File::create(&fst_path).map_err(|e| format!("{}: {e}", fst_path.display()))?,
673 );
674 let mut builder = MapBuilder::new(wtr).map_err(|e| e.to_string())?;
675 for (term, docs) in postings {
677 let list: Vec<(u64, u32)> = docs.iter().map(|(&d, &tf)| (d, tf)).collect();
678 let off = encode_postings(&mut pos_blob, &list);
679 builder
680 .insert(term.as_bytes(), off)
681 .map_err(|e| e.to_string())?;
682 }
683 builder.finish().map_err(|e| e.to_string())?;
684 std::fs::write(&pos_path, &pos_blob).map_err(|e| format!("{}: {e}", pos_path.display()))?;
685 Ok(())
686 }
687
688 pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>, String> {
691 let terms = parse_query(query);
692 if terms.is_empty() {
693 return Ok(Vec::new());
694 }
695 let segments: Vec<Segment> = self
696 .manifest
697 .segments
698 .iter()
699 .map(|&n| self.load_segment(n))
700 .collect::<Result<_, _>>()?;
701 let n_docs = self.manifest.docs.len().max(1) as f32;
702
703 let deleted = &self.manifest.deleted;
704 let mut scores: HashMap<u64, f32> = HashMap::new();
705 for qt in &terms {
706 let mut merged: HashMap<String, Vec<(u64, u32)>> = HashMap::new();
710 match qt {
711 QueryTerm::Exact(t) => {
712 collect_matches(&segments, &Str::new(t), deleted, &mut merged)
713 }
714 QueryTerm::Prefix(t) => {
715 collect_matches(&segments, &Str::new(t).starts_with(), deleted, &mut merged)
716 }
717 QueryTerm::Fuzzy(t, dist) => {
718 if let Ok(lev) = Levenshtein::new(t, *dist) {
719 collect_matches(&segments, &lev, deleted, &mut merged);
720 }
721 }
722 QueryTerm::Regex(pat) => {
723 if let Ok(dfa) = DfaAutomaton::new(pat) {
724 collect_matches(&segments, &dfa, deleted, &mut merged);
725 }
726 }
727 }
728 for list in merged.values() {
729 let df = list.len().max(1) as f32;
730 let idf = (1.0 + n_docs / df).ln();
731 for &(doc, tf) in list {
732 *scores.entry(doc).or_insert(0.0) += idf * (1.0 + (tf as f32).ln());
733 }
734 }
735 }
736
737 let mut hits: Vec<SearchHit> = scores
738 .into_iter()
739 .filter_map(|(doc, score)| {
740 self.manifest.docs.get(&doc).map(|m| SearchHit {
741 key: m.key.clone(),
742 title: m.title.clone(),
743 type_: m.type_.clone(),
744 tags: m.tags.clone(),
745 score,
746 })
747 })
748 .collect();
749 hits.sort_by(|a, b| {
751 b.score
752 .partial_cmp(&a.score)
753 .unwrap_or(std::cmp::Ordering::Equal)
754 .then_with(|| a.key.cmp(&b.key))
755 });
756 hits.truncate(limit);
757 Ok(hits)
758 }
759
760 pub fn condense(&mut self) -> Result<bool, String> {
765 if self.manifest.segments.len() <= 1 && self.manifest.deleted.is_empty() {
766 return Ok(false);
767 }
768 let old_segments = self.manifest.segments.clone();
769 let segments: Vec<Segment> = old_segments
770 .iter()
771 .map(|&n| self.load_segment(n))
772 .collect::<Result<_, _>>()?;
773
774 let mut postings: BTreeMap<String, BTreeMap<u64, u32>> = BTreeMap::new();
776 for seg in &segments {
777 let mut s = seg.map.stream();
778 while let Some((term_bytes, off)) = s.next() {
779 let Ok(term) = std::str::from_utf8(term_bytes) else {
780 continue;
781 };
782 for (doc, tf) in decode_postings(&seg.pos, off) {
783 if self.manifest.deleted.contains(&doc) {
784 continue;
785 }
786 *postings
787 .entry(term.to_string())
788 .or_default()
789 .entry(doc)
790 .or_insert(0) += tf;
791 }
792 }
793 }
794
795 let num = old_segments.iter().copied().max().unwrap_or(0) + 1;
796 if !postings.is_empty() {
797 self.write_segment(num, &postings)?;
798 self.manifest.segments = vec![num];
799 } else {
800 self.manifest.segments.clear();
801 }
802 self.manifest.deleted.clear();
803 for old in old_segments {
804 let (fst_path, pos_path) = self.seg_paths(old);
805 let _ = std::fs::remove_file(fst_path);
806 let _ = std::fs::remove_file(pos_path);
807 }
808 Ok(true)
809 }
810
811 pub fn reset(&mut self) {
814 for num in std::mem::take(&mut self.manifest.segments) {
815 let (fst_path, pos_path) = self.seg_paths(num);
816 let _ = std::fs::remove_file(fst_path);
817 let _ = std::fs::remove_file(pos_path);
818 }
819 self.manifest = Manifest::default();
820 }
821
822 pub fn save(&self) -> Result<(), String> {
824 std::fs::create_dir_all(&self.dir).map_err(|e| format!("{}: {e}", self.dir.display()))?;
825 let path = self.dir.join("manifest.json");
826 let text =
827 serde_json::to_string_pretty(&self.manifest.to_json()).map_err(|e| e.to_string())?;
828 std::fs::write(&path, text).map_err(|e| format!("{}: {e}", path.display()))
829 }
830}
831
832#[cfg(test)]
833mod tests {
834 use super::*;
835 use std::sync::atomic::{AtomicU32, Ordering};
836
837 static TAG: AtomicU32 = AtomicU32::new(0);
838
839 fn scratch() -> PathBuf {
841 let n = TAG.fetch_add(1, Ordering::Relaxed);
842 let dir = std::env::temp_dir().join(format!("ct-okfindex-{}-{n}", std::process::id()));
843 let _ = std::fs::remove_dir_all(&dir);
844 std::fs::create_dir_all(&dir).unwrap();
845 dir
846 }
847
848 fn stat(key: &str, mtime_ns: u64) -> FileStat {
849 FileStat {
850 key: key.to_string(),
851 path: PathBuf::from(key),
852 mtime_ns,
853 size: mtime_ns,
854 }
855 }
856
857 fn doc(title: &str, type_: &str, tags: &[&str], text: &str) -> DocSource {
858 DocSource {
859 title: title.to_string(),
860 type_: type_.to_string(),
861 tags: tags.iter().map(|s| s.to_string()).collect(),
862 text: text.to_string(),
863 }
864 }
865
866 #[test]
867 fn tokenize_lowercases_and_splits_on_nonalnum() {
868 assert_eq!(
869 tokenize("Hello, World! foo_bar"),
870 ["hello", "world", "foo", "bar"]
871 );
872 assert_eq!(tokenize(" "), Vec::<String>::new());
873 }
874
875 #[test]
876 fn parse_query_recognizes_all_modes() {
877 let q = parse_query("plain data* typo~ deep~2 /sch.*ma/");
878 assert_eq!(q[0], QueryTerm::Exact("plain".into()));
879 assert_eq!(q[1], QueryTerm::Prefix("data".into()));
880 assert_eq!(q[2], QueryTerm::Fuzzy("typo".into(), 1));
881 assert_eq!(q[3], QueryTerm::Fuzzy("deep".into(), 2));
882 assert_eq!(q[4], QueryTerm::Regex("sch.*ma".into()));
883 }
884
885 #[test]
886 fn varint_roundtrips() {
887 for v in [0u64, 1, 127, 128, 300, 16384, u32::MAX as u64, u64::MAX] {
888 let mut buf = Vec::new();
889 put_uvarint(&mut buf, v);
890 let mut pos = 0;
891 assert_eq!(get_uvarint(&buf, &mut pos), Some(v));
892 assert_eq!(pos, buf.len());
893 }
894 }
895
896 #[test]
897 fn index_search_exact_prefix_and_fuzzy() {
898 let dir = scratch();
899 let mut idx = Index::open(&dir).unwrap();
900 let files = [stat("a.md", 1), stat("b.md", 1)];
901 idx.update(&files, |f| {
902 Ok(match f.key.as_str() {
903 "a.md" => doc(
904 "Customers",
905 "BigQuery Table",
906 &["pii"],
907 "the customers dimension table",
908 ),
909 _ => doc(
910 "Orders",
911 "BigQuery Table",
912 &["core"],
913 "the orders fact table schema",
914 ),
915 })
916 })
917 .unwrap();
918 idx.save().unwrap();
919
920 let hits = idx.search("customers", 10).unwrap();
922 assert_eq!(hits.len(), 1);
923 assert_eq!(hits[0].key, "a.md");
924
925 let hits = idx.search("custom*", 10).unwrap();
927 assert_eq!(
928 hits.iter().map(|h| h.key.as_str()).collect::<Vec<_>>(),
929 ["a.md"]
930 );
931
932 let hits = idx.search("ordrs~", 10).unwrap();
934 assert_eq!(hits[0].key, "b.md");
935
936 let idx2 = Index::open(&dir).unwrap();
938 assert_eq!(idx2.doc_count(), 2);
939 assert_eq!(idx2.search("schema", 10).unwrap()[0].key, "b.md");
940 }
941
942 #[test]
943 fn regex_mode_matches_substrings_via_the_dfa_adapter() {
944 let dir = scratch();
945 let mut idx = Index::open(&dir).unwrap();
946 idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
947 Ok(match f.key.as_str() {
948 "a.md" => doc("Orders", "Table", &[], "the orders fact table"),
949 _ => doc("Customers", "Table", &[], "the customers dimension"),
950 })
951 })
952 .unwrap();
953
954 let hits = idx.search("/.*omer.*/", 10).unwrap();
956 assert_eq!(
957 hits.iter().map(|h| h.key.as_str()).collect::<Vec<_>>(),
958 ["b.md"]
959 );
960
961 let hits = idx.search("/orders/", 10).unwrap();
963 assert_eq!(hits[0].key, "a.md");
964
965 assert!(idx.search("/zzz.*/", 10).unwrap().is_empty());
967 }
968
969 #[test]
970 fn incremental_update_supersedes_changed_and_drops_removed() {
971 let dir = scratch();
972 let mut idx = Index::open(&dir).unwrap();
973 idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
974 Ok(if f.key == "a.md" {
975 doc("Alpha", "Note", &[], "alpha content markalpha")
976 } else {
977 doc("Beta", "Note", &[], "beta content markbeta")
978 })
979 })
980 .unwrap();
981
982 let report = idx
984 .update(&[stat("a.md", 2)], |_| {
985 Ok(doc("Alpha", "Note", &[], "rewritten markgamma"))
986 })
987 .unwrap();
988 assert_eq!((report.changed, report.removed, report.added), (1, 1, 0));
989 assert_eq!(idx.doc_count(), 1);
990 assert_eq!(idx.segment_count(), 2); assert_eq!(idx.tombstone_count(), 2); assert!(idx.search("markalpha", 10).unwrap().is_empty());
995 assert!(idx.search("markbeta", 10).unwrap().is_empty());
996 assert_eq!(idx.search("markgamma", 10).unwrap()[0].key, "a.md");
997
998 let report = idx
1000 .update(&[stat("a.md", 2)], |_| {
1001 panic!("should not load unchanged file")
1002 })
1003 .unwrap();
1004 assert!(report.is_empty());
1005 assert_eq!(idx.segment_count(), 2);
1006 }
1007
1008 #[test]
1009 fn condense_merges_segments_and_drops_tombstones() {
1010 let dir = scratch();
1011 let mut idx = Index::open(&dir).unwrap();
1012 idx.update(&[stat("a.md", 1)], |_| {
1013 Ok(doc("A", "Note", &[], "first markone"))
1014 })
1015 .unwrap();
1016 idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
1017 Ok(if f.key == "b.md" {
1018 doc("B", "Note", &[], "second marktwo")
1019 } else {
1020 doc("A", "Note", &[], "first markone")
1021 })
1022 })
1023 .unwrap();
1024 idx.update(&[stat("a.md", 9), stat("b.md", 1)], |f| {
1026 Ok(if f.key == "a.md" {
1027 doc("A", "Note", &[], "third markthree")
1028 } else {
1029 doc("B", "Note", &[], "second marktwo")
1030 })
1031 })
1032 .unwrap();
1033 assert!(idx.segment_count() >= 2);
1034 assert!(idx.tombstone_count() >= 1);
1035
1036 assert!(idx.condense().unwrap());
1037 assert_eq!(idx.segment_count(), 1);
1038 assert_eq!(idx.tombstone_count(), 0);
1039 assert_eq!(idx.doc_count(), 2);
1040
1041 assert_eq!(idx.search("markthree", 10).unwrap()[0].key, "a.md");
1043 assert_eq!(idx.search("marktwo", 10).unwrap()[0].key, "b.md");
1044 assert!(idx.search("markone", 10).unwrap().is_empty());
1045
1046 let leftover = std::fs::read_dir(&dir)
1048 .unwrap()
1049 .filter_map(|e| e.ok())
1050 .filter(|e| e.path().extension().is_some_and(|x| x == "fst"))
1051 .count();
1052 assert_eq!(leftover, 1);
1053 }
1054
1055 #[test]
1056 fn pending_counts_added_changed_removed_without_mutating() {
1057 let dir = scratch();
1058 let mut idx = Index::open(&dir).unwrap();
1059 idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
1060 Ok(doc(&f.key, "Note", &[], "content"))
1061 })
1062 .unwrap();
1063 assert_eq!(
1065 idx.pending(&[stat("a.md", 1), stat("b.md", 2), stat("c.md", 1)]),
1066 (1, 1, 0)
1067 );
1068 assert_eq!(idx.pending(&[stat("a.md", 1)]), (0, 0, 1));
1070 assert_eq!(idx.doc_count(), 2);
1072 }
1073
1074 #[test]
1075 fn reset_clears_then_reindexes_from_scratch() {
1076 let dir = scratch();
1077 let mut idx = Index::open(&dir).unwrap();
1078 idx.update(&[stat("a.md", 1)], |_| {
1079 Ok(doc("A", "Note", &[], "unique markx"))
1080 })
1081 .unwrap();
1082 idx.save().unwrap();
1083 assert_eq!(idx.doc_count(), 1);
1084
1085 idx.reset();
1086 assert_eq!((idx.doc_count(), idx.segment_count()), (0, 0));
1087 assert!(idx.search("markx", 10).unwrap().is_empty());
1088
1089 idx.update(&[stat("a.md", 1)], |_| {
1091 Ok(doc("A", "Note", &[], "unique marky"))
1092 })
1093 .unwrap();
1094 assert_eq!(idx.search("marky", 10).unwrap()[0].key, "a.md");
1095 }
1096
1097 #[test]
1098 fn search_ranks_more_relevant_documents_higher() {
1099 let dir = scratch();
1100 let mut idx = Index::open(&dir).unwrap();
1101 idx.update(&[stat("hi.md", 1), stat("lo.md", 1)], |f| {
1102 Ok(if f.key == "hi.md" {
1103 doc("Hi", "Note", &[], "alpha alpha alpha beta")
1104 } else {
1105 doc("Lo", "Note", &[], "alpha gamma delta epsilon")
1106 })
1107 })
1108 .unwrap();
1109 let hits = idx.search("alpha", 10).unwrap();
1110 assert_eq!(hits.len(), 2);
1111 assert_eq!(
1112 hits[0].key, "hi.md",
1113 "the higher term frequency should rank first"
1114 );
1115 assert!(hits[0].score > hits[1].score);
1116 }
1117
1118 #[test]
1119 fn empty_query_returns_no_hits() {
1120 let dir = scratch();
1121 let mut idx = Index::open(&dir).unwrap();
1122 idx.update(&[stat("a.md", 1)], |_| Ok(doc("A", "Note", &[], "content")))
1123 .unwrap();
1124 assert!(idx.search("", 10).unwrap().is_empty());
1125 assert!(idx.search(" ", 10).unwrap().is_empty());
1126 }
1127
1128 #[test]
1129 fn search_merges_postings_across_segments() {
1130 let dir = scratch();
1131 let mut idx = Index::open(&dir).unwrap();
1132 idx.update(&[stat("a.md", 1)], |_| {
1134 Ok(doc("A", "Note", &[], "shared onlya"))
1135 })
1136 .unwrap();
1137 idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
1138 Ok(if f.key == "b.md" {
1139 doc("B", "Note", &[], "shared onlyb")
1140 } else {
1141 doc("A", "Note", &[], "shared onlya")
1142 })
1143 })
1144 .unwrap();
1145 assert!(idx.segment_count() >= 2);
1146 let hits = idx.search("shared", 10).unwrap();
1147 let keys: BTreeSet<&str> = hits.iter().map(|h| h.key.as_str()).collect();
1148 assert!(keys.contains("a.md") && keys.contains("b.md"), "{keys:?}");
1149 }
1150}