use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::path::{Path, PathBuf};
use fst::automaton::{Levenshtein, Str};
use fst::{Automaton, IntoStreamer, Map, MapBuilder, Streamer};
use regex_automata::Anchored;
use regex_automata::dfa::Automaton as _;
use regex_automata::dfa::{StartKind, dense};
use regex_automata::util::primitives::StateID;
const MANIFEST_VERSION: u64 = 1;
const MAX_FUZZY: u32 = 2;
pub fn tokenize(text: &str) -> Vec<String> {
let mut out = Vec::new();
let mut cur = String::new();
for ch in text.chars() {
if ch.is_alphanumeric() {
cur.extend(ch.to_lowercase());
} else if !cur.is_empty() {
out.push(std::mem::take(&mut cur));
}
}
if !cur.is_empty() {
out.push(cur);
}
out
}
fn put_uvarint(buf: &mut Vec<u8>, mut val: u64) {
loop {
let mut byte = (val & 0x7f) as u8;
val >>= 7;
if val != 0 {
byte |= 0x80;
}
buf.push(byte);
if val == 0 {
break;
}
}
}
fn get_uvarint(buf: &[u8], pos: &mut usize) -> Option<u64> {
let mut val = 0u64;
let mut shift = 0u32;
loop {
let byte = *buf.get(*pos)?;
*pos += 1;
val |= u64::from(byte & 0x7f) << shift;
if byte & 0x80 == 0 {
return Some(val);
}
shift += 7;
if shift >= 64 {
return None;
}
}
}
fn encode_postings(buf: &mut Vec<u8>, postings: &[(u64, u32)]) -> u64 {
let offset = buf.len() as u64;
put_uvarint(buf, postings.len() as u64);
for &(doc, tf) in postings {
put_uvarint(buf, doc);
put_uvarint(buf, u64::from(tf));
}
offset
}
fn decode_postings(blob: &[u8], offset: u64) -> Vec<(u64, u32)> {
let mut pos = offset as usize;
let Some(n) = get_uvarint(blob, &mut pos) else {
return Vec::new();
};
let mut out = Vec::with_capacity(n as usize);
for _ in 0..n {
let (Some(doc), Some(tf)) = (get_uvarint(blob, &mut pos), get_uvarint(blob, &mut pos))
else {
break;
};
out.push((doc, tf as u32));
}
out
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileStat {
pub key: String,
pub path: PathBuf,
pub mtime_ns: u64,
pub size: u64,
}
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DocSource {
pub title: String,
pub type_: String,
pub tags: Vec<String>,
pub text: String,
}
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct UpdateReport {
pub added: usize,
pub changed: usize,
pub removed: usize,
pub wrote_segment: bool,
}
impl UpdateReport {
pub fn is_empty(&self) -> bool {
self.added == 0 && self.changed == 0 && self.removed == 0
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
struct DocMeta {
key: String,
title: String,
type_: String,
tags: Vec<String>,
len: u32,
}
#[derive(Debug, Clone, PartialEq, Eq)]
struct FileRec {
doc: u64,
mtime_ns: u64,
size: u64,
}
#[derive(Debug, Clone, Default)]
struct Manifest {
next_doc: u64,
segments: Vec<u32>,
files: BTreeMap<String, FileRec>,
docs: BTreeMap<u64, DocMeta>,
deleted: BTreeSet<u64>,
}
impl Manifest {
fn to_json(&self) -> serde_json::Value {
let files: serde_json::Map<String, serde_json::Value> = self
.files
.iter()
.map(|(k, r)| {
(
k.clone(),
serde_json::json!({ "doc": r.doc, "mtime_ns": r.mtime_ns, "size": r.size }),
)
})
.collect();
let docs: serde_json::Map<String, serde_json::Value> = self
.docs
.iter()
.map(|(id, m)| {
(
id.to_string(),
serde_json::json!({
"key": m.key,
"title": m.title,
"type": m.type_,
"tags": m.tags,
"len": m.len,
}),
)
})
.collect();
serde_json::json!({
"version": MANIFEST_VERSION,
"next_doc": self.next_doc,
"segments": self.segments,
"files": files,
"docs": docs,
"deleted": self.deleted.iter().collect::<Vec<_>>(),
})
}
fn from_json(v: &serde_json::Value) -> Result<Manifest, String> {
let obj = v.as_object().ok_or("manifest is not an object")?;
let mut m = Manifest {
next_doc: obj.get("next_doc").and_then(|x| x.as_u64()).unwrap_or(0),
..Manifest::default()
};
if let Some(arr) = obj.get("segments").and_then(|x| x.as_array()) {
m.segments = arr
.iter()
.filter_map(|x| x.as_u64().map(|n| n as u32))
.collect();
}
if let Some(files) = obj.get("files").and_then(|x| x.as_object()) {
for (k, r) in files {
let doc = r.get("doc").and_then(|x| x.as_u64()).unwrap_or(0);
let mtime_ns = r.get("mtime_ns").and_then(|x| x.as_u64()).unwrap_or(0);
let size = r.get("size").and_then(|x| x.as_u64()).unwrap_or(0);
m.files.insert(
k.clone(),
FileRec {
doc,
mtime_ns,
size,
},
);
}
}
if let Some(docs) = obj.get("docs").and_then(|x| x.as_object()) {
for (id, d) in docs {
let Ok(id) = id.parse::<u64>() else { continue };
let tags = d
.get("tags")
.and_then(|x| x.as_array())
.map(|a| {
a.iter()
.filter_map(|t| t.as_str().map(String::from))
.collect()
})
.unwrap_or_default();
m.docs.insert(
id,
DocMeta {
key: d
.get("key")
.and_then(|x| x.as_str())
.unwrap_or("")
.to_string(),
title: d
.get("title")
.and_then(|x| x.as_str())
.unwrap_or("")
.to_string(),
type_: d
.get("type")
.and_then(|x| x.as_str())
.unwrap_or("")
.to_string(),
tags,
len: d.get("len").and_then(|x| x.as_u64()).unwrap_or(0) as u32,
},
);
}
}
if let Some(arr) = obj.get("deleted").and_then(|x| x.as_array()) {
m.deleted = arr.iter().filter_map(|x| x.as_u64()).collect();
}
Ok(m)
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum QueryTerm {
Exact(String),
Prefix(String),
Fuzzy(String, u32),
Regex(String),
}
pub fn parse_query(query: &str) -> Vec<QueryTerm> {
let mut out = Vec::new();
for tok in query.split_whitespace() {
if tok.len() >= 2 && tok.starts_with('/') && tok.ends_with('/') {
let inner = &tok[1..tok.len() - 1];
if !inner.is_empty() {
out.push(QueryTerm::Regex(inner.to_string()));
}
continue;
}
enum Op {
Exact,
Prefix,
Fuzzy(u32),
}
let (core, op) = if let Some(tilde) = tok.rfind('~') {
let dist = tok[tilde + 1..]
.parse::<u32>()
.unwrap_or(1)
.clamp(1, MAX_FUZZY);
(&tok[..tilde], Op::Fuzzy(dist))
} else if let Some(head) = tok.strip_suffix('*') {
(head, Op::Prefix)
} else {
(tok, Op::Exact)
};
let mut toks = tokenize(core);
if let Some(last) = toks.pop() {
for t in toks {
out.push(QueryTerm::Exact(t));
}
out.push(match op {
Op::Exact => QueryTerm::Exact(last),
Op::Prefix => QueryTerm::Prefix(last),
Op::Fuzzy(d) => QueryTerm::Fuzzy(last, d),
});
}
}
out
}
struct Segment {
map: Map<Vec<u8>>,
pos: Vec<u8>,
}
fn collect_matches<A: Automaton>(
segments: &[Segment],
aut: &A,
deleted: &BTreeSet<u64>,
merged: &mut HashMap<String, Vec<(u64, u32)>>,
) {
for seg in segments {
let mut stream = seg.map.search(aut).into_stream();
while let Some((key, off)) = stream.next() {
let live: Vec<(u64, u32)> = decode_postings(&seg.pos, off)
.into_iter()
.filter(|(doc, _)| !deleted.contains(doc))
.collect();
if !live.is_empty()
&& let Ok(term) = std::str::from_utf8(key)
{
merged.entry(term.to_string()).or_default().extend(live);
}
}
}
}
struct DfaAutomaton {
dfa: dense::DFA<Vec<u32>>,
start: StateID,
}
impl DfaAutomaton {
fn new(pattern: &str) -> Result<DfaAutomaton, String> {
let dfa = dense::Builder::new()
.configure(dense::Config::new().start_kind(StartKind::Anchored))
.build(pattern)
.map_err(|e| format!("invalid regex: {e}"))?;
let start = dfa
.universal_start_state(Anchored::Yes)
.ok_or("regex start depends on look-around, unsupported here")?;
Ok(DfaAutomaton { dfa, start })
}
}
impl Automaton for DfaAutomaton {
type State = StateID;
fn start(&self) -> StateID {
self.start
}
fn is_match(&self, state: &StateID) -> bool {
self.dfa.is_match_state(self.dfa.next_eoi_state(*state))
}
fn can_match(&self, state: &StateID) -> bool {
!self.dfa.is_dead_state(*state)
}
fn accept(&self, state: &StateID, byte: u8) -> StateID {
self.dfa.next_state(*state, byte)
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct SearchHit {
pub key: String,
pub title: String,
pub type_: String,
pub tags: Vec<String>,
pub score: f32,
}
pub struct Index {
dir: PathBuf,
manifest: Manifest,
}
impl Index {
pub fn open(dir: &Path) -> Result<Index, String> {
let manifest_path = dir.join("manifest.json");
let manifest = match std::fs::read_to_string(&manifest_path) {
Ok(text) => {
let v: serde_json::Value = serde_json::from_str(&text)
.map_err(|e| format!("{}: {e}", manifest_path.display()))?;
Manifest::from_json(&v)?
}
Err(_) => Manifest::default(),
};
Ok(Index {
dir: dir.to_path_buf(),
manifest,
})
}
pub fn doc_count(&self) -> usize {
self.manifest.docs.len()
}
pub fn segment_count(&self) -> usize {
self.manifest.segments.len()
}
pub fn tombstone_count(&self) -> usize {
self.manifest.deleted.len()
}
pub fn pending(&self, current: &[FileStat]) -> (usize, usize, usize) {
let present: BTreeSet<&str> = current.iter().map(|f| f.key.as_str()).collect();
let removed = self
.manifest
.files
.keys()
.filter(|k| !present.contains(k.as_str()))
.count();
let (mut added, mut changed) = (0, 0);
for f in current {
match self.manifest.files.get(&f.key) {
None => added += 1,
Some(r) if r.mtime_ns != f.mtime_ns || r.size != f.size => changed += 1,
_ => {}
}
}
(added, changed, removed)
}
fn seg_paths(&self, num: u32) -> (PathBuf, PathBuf) {
let base = format!("seg-{num:05}");
(
self.dir.join(format!("{base}.fst")),
self.dir.join(format!("{base}.pos")),
)
}
fn load_segment(&self, num: u32) -> Result<Segment, String> {
let (fst_path, pos_path) = self.seg_paths(num);
let fst_bytes =
std::fs::read(&fst_path).map_err(|e| format!("{}: {e}", fst_path.display()))?;
let pos = std::fs::read(&pos_path).map_err(|e| format!("{}: {e}", pos_path.display()))?;
let map = Map::new(fst_bytes).map_err(|e| format!("{}: {e}", fst_path.display()))?;
Ok(Segment { map, pos })
}
pub fn update<F>(&mut self, current: &[FileStat], load: F) -> Result<UpdateReport, String>
where
F: Fn(&FileStat) -> Result<DocSource, String>,
{
let mut report = UpdateReport::default();
let present: BTreeSet<&str> = current.iter().map(|f| f.key.as_str()).collect();
let removed_keys: Vec<String> = self
.manifest
.files
.keys()
.filter(|k| !present.contains(k.as_str()))
.cloned()
.collect();
for key in removed_keys {
if let Some(rec) = self.manifest.files.remove(&key) {
self.manifest.docs.remove(&rec.doc);
self.manifest.deleted.insert(rec.doc);
report.removed += 1;
}
}
let mut postings: BTreeMap<String, BTreeMap<u64, u32>> = BTreeMap::new();
for f in current {
let unchanged = self
.manifest
.files
.get(&f.key)
.is_some_and(|r| r.mtime_ns == f.mtime_ns && r.size == f.size);
if unchanged {
continue;
}
let is_change = self.manifest.files.contains_key(&f.key);
if let Some(old) = self.manifest.files.get(&f.key) {
self.manifest.docs.remove(&old.doc);
self.manifest.deleted.insert(old.doc);
}
let src = load(f)?;
let doc = self.manifest.next_doc;
self.manifest.next_doc += 1;
let terms = tokenize(&format!(
"{} {} {} {}",
src.title,
src.type_,
src.tags.join(" "),
src.text
));
let len = terms.len() as u32;
for term in terms {
*postings.entry(term).or_default().entry(doc).or_insert(0) += 1;
}
self.manifest.docs.insert(
doc,
DocMeta {
key: f.key.clone(),
title: src.title,
type_: src.type_,
tags: src.tags,
len,
},
);
self.manifest.files.insert(
f.key.clone(),
FileRec {
doc,
mtime_ns: f.mtime_ns,
size: f.size,
},
);
if is_change {
report.changed += 1;
} else {
report.added += 1;
}
}
if !postings.is_empty() {
let num = self
.manifest
.segments
.iter()
.copied()
.max()
.map(|n| n + 1)
.unwrap_or(0);
self.write_segment(num, &postings)?;
self.manifest.segments.push(num);
report.wrote_segment = true;
}
Ok(report)
}
fn write_segment(
&self,
num: u32,
postings: &BTreeMap<String, BTreeMap<u64, u32>>,
) -> Result<(), String> {
std::fs::create_dir_all(&self.dir).map_err(|e| format!("{}: {e}", self.dir.display()))?;
let (fst_path, pos_path) = self.seg_paths(num);
let mut pos_blob = Vec::new();
let wtr = std::io::BufWriter::new(
std::fs::File::create(&fst_path).map_err(|e| format!("{}: {e}", fst_path.display()))?,
);
let mut builder = MapBuilder::new(wtr).map_err(|e| e.to_string())?;
for (term, docs) in postings {
let list: Vec<(u64, u32)> = docs.iter().map(|(&d, &tf)| (d, tf)).collect();
let off = encode_postings(&mut pos_blob, &list);
builder
.insert(term.as_bytes(), off)
.map_err(|e| e.to_string())?;
}
builder.finish().map_err(|e| e.to_string())?;
std::fs::write(&pos_path, &pos_blob).map_err(|e| format!("{}: {e}", pos_path.display()))?;
Ok(())
}
pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>, String> {
let terms = parse_query(query);
if terms.is_empty() {
return Ok(Vec::new());
}
let segments: Vec<Segment> = self
.manifest
.segments
.iter()
.map(|&n| self.load_segment(n))
.collect::<Result<_, _>>()?;
let n_docs = self.manifest.docs.len().max(1) as f32;
let deleted = &self.manifest.deleted;
let mut scores: HashMap<u64, f32> = HashMap::new();
for qt in &terms {
let mut merged: HashMap<String, Vec<(u64, u32)>> = HashMap::new();
match qt {
QueryTerm::Exact(t) => {
collect_matches(&segments, &Str::new(t), deleted, &mut merged)
}
QueryTerm::Prefix(t) => {
collect_matches(&segments, &Str::new(t).starts_with(), deleted, &mut merged)
}
QueryTerm::Fuzzy(t, dist) => {
if let Ok(lev) = Levenshtein::new(t, *dist) {
collect_matches(&segments, &lev, deleted, &mut merged);
}
}
QueryTerm::Regex(pat) => {
if let Ok(dfa) = DfaAutomaton::new(pat) {
collect_matches(&segments, &dfa, deleted, &mut merged);
}
}
}
for list in merged.values() {
let df = list.len().max(1) as f32;
let idf = (1.0 + n_docs / df).ln();
for &(doc, tf) in list {
*scores.entry(doc).or_insert(0.0) += idf * (1.0 + (tf as f32).ln());
}
}
}
let mut hits: Vec<SearchHit> = scores
.into_iter()
.filter_map(|(doc, score)| {
self.manifest.docs.get(&doc).map(|m| SearchHit {
key: m.key.clone(),
title: m.title.clone(),
type_: m.type_.clone(),
tags: m.tags.clone(),
score,
})
})
.collect();
hits.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| a.key.cmp(&b.key))
});
hits.truncate(limit);
Ok(hits)
}
pub fn condense(&mut self) -> Result<bool, String> {
if self.manifest.segments.len() <= 1 && self.manifest.deleted.is_empty() {
return Ok(false);
}
let old_segments = self.manifest.segments.clone();
let segments: Vec<Segment> = old_segments
.iter()
.map(|&n| self.load_segment(n))
.collect::<Result<_, _>>()?;
let mut postings: BTreeMap<String, BTreeMap<u64, u32>> = BTreeMap::new();
for seg in &segments {
let mut s = seg.map.stream();
while let Some((term_bytes, off)) = s.next() {
let Ok(term) = std::str::from_utf8(term_bytes) else {
continue;
};
for (doc, tf) in decode_postings(&seg.pos, off) {
if self.manifest.deleted.contains(&doc) {
continue;
}
*postings
.entry(term.to_string())
.or_default()
.entry(doc)
.or_insert(0) += tf;
}
}
}
let num = old_segments.iter().copied().max().unwrap_or(0) + 1;
if !postings.is_empty() {
self.write_segment(num, &postings)?;
self.manifest.segments = vec![num];
} else {
self.manifest.segments.clear();
}
self.manifest.deleted.clear();
for old in old_segments {
let (fst_path, pos_path) = self.seg_paths(old);
let _ = std::fs::remove_file(fst_path);
let _ = std::fs::remove_file(pos_path);
}
Ok(true)
}
pub fn reset(&mut self) {
for num in std::mem::take(&mut self.manifest.segments) {
let (fst_path, pos_path) = self.seg_paths(num);
let _ = std::fs::remove_file(fst_path);
let _ = std::fs::remove_file(pos_path);
}
self.manifest = Manifest::default();
}
pub fn save(&self) -> Result<(), String> {
std::fs::create_dir_all(&self.dir).map_err(|e| format!("{}: {e}", self.dir.display()))?;
let path = self.dir.join("manifest.json");
let text =
serde_json::to_string_pretty(&self.manifest.to_json()).map_err(|e| e.to_string())?;
std::fs::write(&path, text).map_err(|e| format!("{}: {e}", path.display()))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::atomic::{AtomicU32, Ordering};
static TAG: AtomicU32 = AtomicU32::new(0);
fn scratch() -> PathBuf {
let n = TAG.fetch_add(1, Ordering::Relaxed);
let dir = std::env::temp_dir().join(format!("ct-okfindex-{}-{n}", std::process::id()));
let _ = std::fs::remove_dir_all(&dir);
std::fs::create_dir_all(&dir).unwrap();
dir
}
fn stat(key: &str, mtime_ns: u64) -> FileStat {
FileStat {
key: key.to_string(),
path: PathBuf::from(key),
mtime_ns,
size: mtime_ns,
}
}
fn doc(title: &str, type_: &str, tags: &[&str], text: &str) -> DocSource {
DocSource {
title: title.to_string(),
type_: type_.to_string(),
tags: tags.iter().map(|s| s.to_string()).collect(),
text: text.to_string(),
}
}
#[test]
fn tokenize_lowercases_and_splits_on_nonalnum() {
assert_eq!(
tokenize("Hello, World! foo_bar"),
["hello", "world", "foo", "bar"]
);
assert_eq!(tokenize(" "), Vec::<String>::new());
}
#[test]
fn parse_query_recognizes_all_modes() {
let q = parse_query("plain data* typo~ deep~2 /sch.*ma/");
assert_eq!(q[0], QueryTerm::Exact("plain".into()));
assert_eq!(q[1], QueryTerm::Prefix("data".into()));
assert_eq!(q[2], QueryTerm::Fuzzy("typo".into(), 1));
assert_eq!(q[3], QueryTerm::Fuzzy("deep".into(), 2));
assert_eq!(q[4], QueryTerm::Regex("sch.*ma".into()));
}
#[test]
fn varint_roundtrips() {
for v in [0u64, 1, 127, 128, 300, 16384, u32::MAX as u64, u64::MAX] {
let mut buf = Vec::new();
put_uvarint(&mut buf, v);
let mut pos = 0;
assert_eq!(get_uvarint(&buf, &mut pos), Some(v));
assert_eq!(pos, buf.len());
}
}
#[test]
fn index_search_exact_prefix_and_fuzzy() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
let files = [stat("a.md", 1), stat("b.md", 1)];
idx.update(&files, |f| {
Ok(match f.key.as_str() {
"a.md" => doc(
"Customers",
"BigQuery Table",
&["pii"],
"the customers dimension table",
),
_ => doc(
"Orders",
"BigQuery Table",
&["core"],
"the orders fact table schema",
),
})
})
.unwrap();
idx.save().unwrap();
let hits = idx.search("customers", 10).unwrap();
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].key, "a.md");
let hits = idx.search("custom*", 10).unwrap();
assert_eq!(
hits.iter().map(|h| h.key.as_str()).collect::<Vec<_>>(),
["a.md"]
);
let hits = idx.search("ordrs~", 10).unwrap();
assert_eq!(hits[0].key, "b.md");
let idx2 = Index::open(&dir).unwrap();
assert_eq!(idx2.doc_count(), 2);
assert_eq!(idx2.search("schema", 10).unwrap()[0].key, "b.md");
}
#[test]
fn regex_mode_matches_substrings_via_the_dfa_adapter() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
Ok(match f.key.as_str() {
"a.md" => doc("Orders", "Table", &[], "the orders fact table"),
_ => doc("Customers", "Table", &[], "the customers dimension"),
})
})
.unwrap();
let hits = idx.search("/.*omer.*/", 10).unwrap();
assert_eq!(
hits.iter().map(|h| h.key.as_str()).collect::<Vec<_>>(),
["b.md"]
);
let hits = idx.search("/orders/", 10).unwrap();
assert_eq!(hits[0].key, "a.md");
assert!(idx.search("/zzz.*/", 10).unwrap().is_empty());
}
#[test]
fn incremental_update_supersedes_changed_and_drops_removed() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
Ok(if f.key == "a.md" {
doc("Alpha", "Note", &[], "alpha content markalpha")
} else {
doc("Beta", "Note", &[], "beta content markbeta")
})
})
.unwrap();
let report = idx
.update(&[stat("a.md", 2)], |_| {
Ok(doc("Alpha", "Note", &[], "rewritten markgamma"))
})
.unwrap();
assert_eq!((report.changed, report.removed, report.added), (1, 1, 0));
assert_eq!(idx.doc_count(), 1);
assert_eq!(idx.segment_count(), 2); assert_eq!(idx.tombstone_count(), 2);
assert!(idx.search("markalpha", 10).unwrap().is_empty());
assert!(idx.search("markbeta", 10).unwrap().is_empty());
assert_eq!(idx.search("markgamma", 10).unwrap()[0].key, "a.md");
let report = idx
.update(&[stat("a.md", 2)], |_| {
panic!("should not load unchanged file")
})
.unwrap();
assert!(report.is_empty());
assert_eq!(idx.segment_count(), 2);
}
#[test]
fn condense_merges_segments_and_drops_tombstones() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
idx.update(&[stat("a.md", 1)], |_| {
Ok(doc("A", "Note", &[], "first markone"))
})
.unwrap();
idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
Ok(if f.key == "b.md" {
doc("B", "Note", &[], "second marktwo")
} else {
doc("A", "Note", &[], "first markone")
})
})
.unwrap();
idx.update(&[stat("a.md", 9), stat("b.md", 1)], |f| {
Ok(if f.key == "a.md" {
doc("A", "Note", &[], "third markthree")
} else {
doc("B", "Note", &[], "second marktwo")
})
})
.unwrap();
assert!(idx.segment_count() >= 2);
assert!(idx.tombstone_count() >= 1);
assert!(idx.condense().unwrap());
assert_eq!(idx.segment_count(), 1);
assert_eq!(idx.tombstone_count(), 0);
assert_eq!(idx.doc_count(), 2);
assert_eq!(idx.search("markthree", 10).unwrap()[0].key, "a.md");
assert_eq!(idx.search("marktwo", 10).unwrap()[0].key, "b.md");
assert!(idx.search("markone", 10).unwrap().is_empty());
let leftover = std::fs::read_dir(&dir)
.unwrap()
.filter_map(|e| e.ok())
.filter(|e| e.path().extension().is_some_and(|x| x == "fst"))
.count();
assert_eq!(leftover, 1);
}
#[test]
fn pending_counts_added_changed_removed_without_mutating() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
Ok(doc(&f.key, "Note", &[], "content"))
})
.unwrap();
assert_eq!(
idx.pending(&[stat("a.md", 1), stat("b.md", 2), stat("c.md", 1)]),
(1, 1, 0)
);
assert_eq!(idx.pending(&[stat("a.md", 1)]), (0, 0, 1));
assert_eq!(idx.doc_count(), 2);
}
#[test]
fn reset_clears_then_reindexes_from_scratch() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
idx.update(&[stat("a.md", 1)], |_| {
Ok(doc("A", "Note", &[], "unique markx"))
})
.unwrap();
idx.save().unwrap();
assert_eq!(idx.doc_count(), 1);
idx.reset();
assert_eq!((idx.doc_count(), idx.segment_count()), (0, 0));
assert!(idx.search("markx", 10).unwrap().is_empty());
idx.update(&[stat("a.md", 1)], |_| {
Ok(doc("A", "Note", &[], "unique marky"))
})
.unwrap();
assert_eq!(idx.search("marky", 10).unwrap()[0].key, "a.md");
}
#[test]
fn search_ranks_more_relevant_documents_higher() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
idx.update(&[stat("hi.md", 1), stat("lo.md", 1)], |f| {
Ok(if f.key == "hi.md" {
doc("Hi", "Note", &[], "alpha alpha alpha beta")
} else {
doc("Lo", "Note", &[], "alpha gamma delta epsilon")
})
})
.unwrap();
let hits = idx.search("alpha", 10).unwrap();
assert_eq!(hits.len(), 2);
assert_eq!(
hits[0].key, "hi.md",
"the higher term frequency should rank first"
);
assert!(hits[0].score > hits[1].score);
}
#[test]
fn empty_query_returns_no_hits() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
idx.update(&[stat("a.md", 1)], |_| Ok(doc("A", "Note", &[], "content")))
.unwrap();
assert!(idx.search("", 10).unwrap().is_empty());
assert!(idx.search(" ", 10).unwrap().is_empty());
}
#[test]
fn search_merges_postings_across_segments() {
let dir = scratch();
let mut idx = Index::open(&dir).unwrap();
idx.update(&[stat("a.md", 1)], |_| {
Ok(doc("A", "Note", &[], "shared onlya"))
})
.unwrap();
idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
Ok(if f.key == "b.md" {
doc("B", "Note", &[], "shared onlyb")
} else {
doc("A", "Note", &[], "shared onlya")
})
})
.unwrap();
assert!(idx.segment_count() >= 2);
let hits = idx.search("shared", 10).unwrap();
let keys: BTreeSet<&str> = hits.iter().map(|h| h.key.as_str()).collect();
assert!(keys.contains("a.md") && keys.contains("b.md"), "{keys:?}");
}
}