coding_tools/
okfindex.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 Jonathan Shook
3
4//! A lazily-maintained full-text index over OKF concept files, built on
5//! BurntSushi's [`fst`] crate — the same immutable finite-state-transducer
6//! machinery search engines like tantivy use for their term dictionaries.
7//!
8//! # Why this shape
9//!
10//! The index is a set of **immutable segments**. Each incremental update writes
11//! one *new* segment for the batch of changed files and never rewrites the
12//! existing ones — so "layering in new content" costs only the new segment,
13//! exactly the property the suite wanted. Superseded or removed documents are
14//! recorded as **tombstones** in the manifest and filtered at query time;
15//! [`Index::condense`] later merges every segment into one and drops the
16//! tombstones, reclaiming space without re-reading the source files.
17//!
18//! A segment is two files: `seg-NNNNN.fst`, an [`fst::Map`] from **term** to a
19//! byte offset, and `seg-NNNNN.pos`, the postings blob those offsets point into
20//! (a varint-encoded `(doc_id, term_frequency)` list per term). Global state —
21//! the next document id, the live segment list, the per-file `(doc, mtime,
22//! size)` records that drive staleness, per-document metadata, and the tombstone
23//! set — lives in `manifest.json`. Segment bytes are read into memory on demand
24//! rather than memory-mapped, keeping the dependency surface to just `fst`.
25//!
26//! # Query modes
27//!
28//! [`Index::search`] understands a small query grammar (see [`QueryTerm`]):
29//! plain terms (exact), `term*` (prefix), `term~`/`term~2` (Levenshtein fuzzy),
30//! and `/regex/`. Exact, prefix, and fuzzy are native fst automata; the regex
31//! mode drives a `regex-automata` dense DFA *as* an [`fst::Automaton`] (the
32//! modern equivalent of the `transducer` feature dropped from regex-automata
33//! 0.4), so it prunes the term FST during traversal instead of scanning it.
34
35use std::collections::{BTreeMap, BTreeSet, HashMap};
36use std::path::{Path, PathBuf};
37
38use fst::automaton::{Levenshtein, Str};
39use fst::{Automaton, IntoStreamer, Map, MapBuilder, Streamer};
40// Drives a regex-automata 0.4 dense DFA as an `fst::Automaton` (the modern
41// equivalent of the `transducer` feature dropped from regex-automata 0.4).
42use regex_automata::Anchored;
43use regex_automata::dfa::Automaton as _;
44use regex_automata::dfa::{StartKind, dense};
45use regex_automata::util::primitives::StateID;
46
47/// Manifest format version, bumped on any incompatible on-disk change.
48const MANIFEST_VERSION: u64 = 1;
49
50/// Maximum edit distance honoured for a `~N` fuzzy query (Levenshtein automata
51/// grow quickly with distance; 2 is the usual practical ceiling).
52const MAX_FUZZY: u32 = 2;
53
54// ----- Tokenization -------------------------------------------------------------------
55
56/// Split `text` into lowercased alphanumeric terms — the shared tokenizer for
57/// both indexing and (per-token) querying. Deliberately minimal: Unicode
58/// alphanumeric runs, lowercased, no stemming or stop-words, so behaviour is
59/// predictable and dependency-free.
60pub fn tokenize(text: &str) -> Vec<String> {
61    let mut out = Vec::new();
62    let mut cur = String::new();
63    for ch in text.chars() {
64        if ch.is_alphanumeric() {
65            cur.extend(ch.to_lowercase());
66        } else if !cur.is_empty() {
67            out.push(std::mem::take(&mut cur));
68        }
69    }
70    if !cur.is_empty() {
71        out.push(cur);
72    }
73    out
74}
75
76// ----- Varint postings ----------------------------------------------------------------
77
78/// Append `val` to `buf` as an unsigned LEB128 varint.
79fn put_uvarint(buf: &mut Vec<u8>, mut val: u64) {
80    loop {
81        let mut byte = (val & 0x7f) as u8;
82        val >>= 7;
83        if val != 0 {
84            byte |= 0x80;
85        }
86        buf.push(byte);
87        if val == 0 {
88            break;
89        }
90    }
91}
92
93/// Read an unsigned LEB128 varint from `buf` at `*pos`, advancing `*pos`.
94fn get_uvarint(buf: &[u8], pos: &mut usize) -> Option<u64> {
95    let mut val = 0u64;
96    let mut shift = 0u32;
97    loop {
98        let byte = *buf.get(*pos)?;
99        *pos += 1;
100        val |= u64::from(byte & 0x7f) << shift;
101        if byte & 0x80 == 0 {
102            return Some(val);
103        }
104        shift += 7;
105        if shift >= 64 {
106            return None;
107        }
108    }
109}
110
111/// Encode one term's postings (a `(doc_id, term_frequency)` list, doc ascending)
112/// into `buf`, returning the byte offset the list begins at.
113fn encode_postings(buf: &mut Vec<u8>, postings: &[(u64, u32)]) -> u64 {
114    let offset = buf.len() as u64;
115    put_uvarint(buf, postings.len() as u64);
116    for &(doc, tf) in postings {
117        put_uvarint(buf, doc);
118        put_uvarint(buf, u64::from(tf));
119    }
120    offset
121}
122
123/// Decode the postings list stored at `offset` in a segment's `.pos` blob.
124fn decode_postings(blob: &[u8], offset: u64) -> Vec<(u64, u32)> {
125    let mut pos = offset as usize;
126    let Some(n) = get_uvarint(blob, &mut pos) else {
127        return Vec::new();
128    };
129    let mut out = Vec::with_capacity(n as usize);
130    for _ in 0..n {
131        let (Some(doc), Some(tf)) = (get_uvarint(blob, &mut pos), get_uvarint(blob, &mut pos))
132        else {
133            break;
134        };
135        out.push((doc, tf as u32));
136    }
137    out
138}
139
140// ----- The document feed --------------------------------------------------------------
141
142/// A file's identity for staleness: its path-relative key plus the `mtime`/size
143/// the indexer compares against the manifest to decide what to re-index.
144#[derive(Debug, Clone, PartialEq, Eq)]
145pub struct FileStat {
146    /// Stable key for the file (e.g. project-relative path with `/` separators).
147    pub key: String,
148    /// Absolute path, for the loader to read when (re)indexing.
149    pub path: PathBuf,
150    /// Last-modified time in nanoseconds since the Unix epoch.
151    pub mtime_ns: u64,
152    /// File size in bytes.
153    pub size: u64,
154}
155
156/// A document ready to index: its searchable text plus the metadata carried into
157/// search results.
158#[derive(Debug, Clone, Default, PartialEq, Eq)]
159pub struct DocSource {
160    /// Human title (frontmatter `title`, else the file stem).
161    pub title: String,
162    /// OKF `type`.
163    pub type_: String,
164    /// Tags.
165    pub tags: Vec<String>,
166    /// The full searchable text (frontmatter fields + body).
167    pub text: String,
168}
169
170/// What an [`Index::update`] changed.
171#[derive(Debug, Default, Clone, PartialEq, Eq)]
172pub struct UpdateReport {
173    pub added: usize,
174    pub changed: usize,
175    pub removed: usize,
176    /// Whether a new segment was written (true iff added+changed > 0).
177    pub wrote_segment: bool,
178}
179
180impl UpdateReport {
181    /// Whether anything at all changed (so the manifest needs saving).
182    pub fn is_empty(&self) -> bool {
183        self.added == 0 && self.changed == 0 && self.removed == 0
184    }
185}
186
187// ----- Manifest -----------------------------------------------------------------------
188
189/// Per-document metadata, surfaced in search results and used for scoring.
190#[derive(Debug, Clone, PartialEq, Eq)]
191struct DocMeta {
192    key: String,
193    title: String,
194    type_: String,
195    tags: Vec<String>,
196    /// Total term count (document length), for length-aware scoring later.
197    len: u32,
198}
199
200/// Per-file staleness record.
201#[derive(Debug, Clone, PartialEq, Eq)]
202struct FileRec {
203    doc: u64,
204    mtime_ns: u64,
205    size: u64,
206}
207
208/// The index's global state, persisted as `manifest.json`.
209#[derive(Debug, Clone, Default)]
210struct Manifest {
211    next_doc: u64,
212    segments: Vec<u32>,
213    files: BTreeMap<String, FileRec>,
214    docs: BTreeMap<u64, DocMeta>,
215    deleted: BTreeSet<u64>,
216}
217
218impl Manifest {
219    fn to_json(&self) -> serde_json::Value {
220        let files: serde_json::Map<String, serde_json::Value> = self
221            .files
222            .iter()
223            .map(|(k, r)| {
224                (
225                    k.clone(),
226                    serde_json::json!({ "doc": r.doc, "mtime_ns": r.mtime_ns, "size": r.size }),
227                )
228            })
229            .collect();
230        let docs: serde_json::Map<String, serde_json::Value> = self
231            .docs
232            .iter()
233            .map(|(id, m)| {
234                (
235                    id.to_string(),
236                    serde_json::json!({
237                        "key": m.key,
238                        "title": m.title,
239                        "type": m.type_,
240                        "tags": m.tags,
241                        "len": m.len,
242                    }),
243                )
244            })
245            .collect();
246        serde_json::json!({
247            "version": MANIFEST_VERSION,
248            "next_doc": self.next_doc,
249            "segments": self.segments,
250            "files": files,
251            "docs": docs,
252            "deleted": self.deleted.iter().collect::<Vec<_>>(),
253        })
254    }
255
256    fn from_json(v: &serde_json::Value) -> Result<Manifest, String> {
257        let obj = v.as_object().ok_or("manifest is not an object")?;
258        let mut m = Manifest {
259            next_doc: obj.get("next_doc").and_then(|x| x.as_u64()).unwrap_or(0),
260            ..Manifest::default()
261        };
262        if let Some(arr) = obj.get("segments").and_then(|x| x.as_array()) {
263            m.segments = arr
264                .iter()
265                .filter_map(|x| x.as_u64().map(|n| n as u32))
266                .collect();
267        }
268        if let Some(files) = obj.get("files").and_then(|x| x.as_object()) {
269            for (k, r) in files {
270                let doc = r.get("doc").and_then(|x| x.as_u64()).unwrap_or(0);
271                let mtime_ns = r.get("mtime_ns").and_then(|x| x.as_u64()).unwrap_or(0);
272                let size = r.get("size").and_then(|x| x.as_u64()).unwrap_or(0);
273                m.files.insert(
274                    k.clone(),
275                    FileRec {
276                        doc,
277                        mtime_ns,
278                        size,
279                    },
280                );
281            }
282        }
283        if let Some(docs) = obj.get("docs").and_then(|x| x.as_object()) {
284            for (id, d) in docs {
285                let Ok(id) = id.parse::<u64>() else { continue };
286                let tags = d
287                    .get("tags")
288                    .and_then(|x| x.as_array())
289                    .map(|a| {
290                        a.iter()
291                            .filter_map(|t| t.as_str().map(String::from))
292                            .collect()
293                    })
294                    .unwrap_or_default();
295                m.docs.insert(
296                    id,
297                    DocMeta {
298                        key: d
299                            .get("key")
300                            .and_then(|x| x.as_str())
301                            .unwrap_or("")
302                            .to_string(),
303                        title: d
304                            .get("title")
305                            .and_then(|x| x.as_str())
306                            .unwrap_or("")
307                            .to_string(),
308                        type_: d
309                            .get("type")
310                            .and_then(|x| x.as_str())
311                            .unwrap_or("")
312                            .to_string(),
313                        tags,
314                        len: d.get("len").and_then(|x| x.as_u64()).unwrap_or(0) as u32,
315                    },
316                );
317            }
318        }
319        if let Some(arr) = obj.get("deleted").and_then(|x| x.as_array()) {
320            m.deleted = arr.iter().filter_map(|x| x.as_u64()).collect();
321        }
322        Ok(m)
323    }
324}
325
326// ----- Query grammar ------------------------------------------------------------------
327
328/// One parsed query token and how it should match the term dictionary.
329#[derive(Debug, Clone, PartialEq, Eq)]
330pub enum QueryTerm {
331    /// Exact term match.
332    Exact(String),
333    /// Prefix match (`term*`).
334    Prefix(String),
335    /// Levenshtein fuzzy match within `dist` edits (`term~` / `term~N`).
336    Fuzzy(String, u32),
337    /// Regex over the term dictionary (`/pattern/`) — a slower full-term scan.
338    Regex(String),
339}
340
341/// Parse a query string into its [`QueryTerm`]s. Whitespace separates tokens;
342/// `/.../` is one regex token. The non-regex modes share the index's
343/// [`tokenize`] so a query splits exactly the way the stored terms did: a token
344/// that tokenizes to several terms contributes leading [`Exact`](QueryTerm::Exact)
345/// terms, and any `*`/`~` operator applies to the final term (the typical token
346/// is a single term, so this is just `Prefix`/`Fuzzy`). Empty/operator-only
347/// tokens are dropped.
348pub fn parse_query(query: &str) -> Vec<QueryTerm> {
349    let mut out = Vec::new();
350    for tok in query.split_whitespace() {
351        // /regex/ — the pattern matches whole dictionary terms verbatim.
352        if tok.len() >= 2 && tok.starts_with('/') && tok.ends_with('/') {
353            let inner = &tok[1..tok.len() - 1];
354            if !inner.is_empty() {
355                out.push(QueryTerm::Regex(inner.to_string()));
356            }
357            continue;
358        }
359        // Split off a trailing operator, then tokenize the core the same way the
360        // index did, applying the operator to the final term.
361        enum Op {
362            Exact,
363            Prefix,
364            Fuzzy(u32),
365        }
366        let (core, op) = if let Some(tilde) = tok.rfind('~') {
367            let dist = tok[tilde + 1..]
368                .parse::<u32>()
369                .unwrap_or(1)
370                .clamp(1, MAX_FUZZY);
371            (&tok[..tilde], Op::Fuzzy(dist))
372        } else if let Some(head) = tok.strip_suffix('*') {
373            (head, Op::Prefix)
374        } else {
375            (tok, Op::Exact)
376        };
377        let mut toks = tokenize(core);
378        if let Some(last) = toks.pop() {
379            for t in toks {
380                out.push(QueryTerm::Exact(t));
381            }
382            out.push(match op {
383                Op::Exact => QueryTerm::Exact(last),
384                Op::Prefix => QueryTerm::Prefix(last),
385                Op::Fuzzy(d) => QueryTerm::Fuzzy(last, d),
386            });
387        }
388    }
389    out
390}
391
392// ----- A loaded segment ---------------------------------------------------------------
393
394/// One immutable segment held in memory for querying.
395struct Segment {
396    map: Map<Vec<u8>>,
397    pos: Vec<u8>,
398}
399
400/// Accumulate, into `merged`, the `(dictionary term -> live postings)` of every
401/// segment key accepted by `aut`, dropping tombstoned documents. Generic over
402/// the automaton so the prefix/fuzzy/exact/regex modes share one walk.
403fn collect_matches<A: Automaton>(
404    segments: &[Segment],
405    aut: &A,
406    deleted: &BTreeSet<u64>,
407    merged: &mut HashMap<String, Vec<(u64, u32)>>,
408) {
409    for seg in segments {
410        let mut stream = seg.map.search(aut).into_stream();
411        while let Some((key, off)) = stream.next() {
412            let live: Vec<(u64, u32)> = decode_postings(&seg.pos, off)
413                .into_iter()
414                .filter(|(doc, _)| !deleted.contains(doc))
415                .collect();
416            if !live.is_empty()
417                && let Ok(term) = std::str::from_utf8(key)
418            {
419                merged.entry(term.to_string()).or_default().extend(live);
420            }
421        }
422    }
423}
424
425/// An [`fst::Automaton`] backed by a regex-automata dense DFA, so `/regex/`
426/// queries prune the term FST during traversal instead of scanning every term.
427/// The DFA is compiled **anchored**, because fst feeds a key's bytes from the
428/// start and a match means the whole key matched.
429struct DfaAutomaton {
430    dfa: dense::DFA<Vec<u32>>,
431    start: StateID,
432}
433
434impl DfaAutomaton {
435    fn new(pattern: &str) -> Result<DfaAutomaton, String> {
436        let dfa = dense::Builder::new()
437            .configure(dense::Config::new().start_kind(StartKind::Anchored))
438            .build(pattern)
439            .map_err(|e| format!("invalid regex: {e}"))?;
440        let start = dfa
441            .universal_start_state(Anchored::Yes)
442            .ok_or("regex start depends on look-around, unsupported here")?;
443        Ok(DfaAutomaton { dfa, start })
444    }
445}
446
447impl Automaton for DfaAutomaton {
448    type State = StateID;
449
450    fn start(&self) -> StateID {
451        self.start
452    }
453
454    fn is_match(&self, state: &StateID) -> bool {
455        // A DFA reports a whole-input match only after the end-of-input step.
456        self.dfa.is_match_state(self.dfa.next_eoi_state(*state))
457    }
458
459    fn can_match(&self, state: &StateID) -> bool {
460        // A dead state can never reach a match, so fst can prune this branch.
461        !self.dfa.is_dead_state(*state)
462    }
463
464    fn accept(&self, state: &StateID, byte: u8) -> StateID {
465        self.dfa.next_state(*state, byte)
466    }
467}
468
469// ----- The index ----------------------------------------------------------------------
470
471/// One search result.
472#[derive(Debug, Clone, PartialEq)]
473pub struct SearchHit {
474    /// The document's stable key (project-relative path).
475    pub key: String,
476    pub title: String,
477    pub type_: String,
478    pub tags: Vec<String>,
479    pub score: f32,
480}
481
482/// A lazily-maintained fst-segment index rooted at a directory (`.ct/okf/`).
483pub struct Index {
484    dir: PathBuf,
485    manifest: Manifest,
486}
487
488impl Index {
489    /// Open the index at `dir`, loading `manifest.json` if present (an absent or
490    /// unreadable manifest yields an empty index). The directory is created on
491    /// the first [`Index::save`].
492    pub fn open(dir: &Path) -> Result<Index, String> {
493        let manifest_path = dir.join("manifest.json");
494        let manifest = match std::fs::read_to_string(&manifest_path) {
495            Ok(text) => {
496                let v: serde_json::Value = serde_json::from_str(&text)
497                    .map_err(|e| format!("{}: {e}", manifest_path.display()))?;
498                Manifest::from_json(&v)?
499            }
500            Err(_) => Manifest::default(),
501        };
502        Ok(Index {
503            dir: dir.to_path_buf(),
504            manifest,
505        })
506    }
507
508    /// Number of live (non-tombstoned) documents.
509    pub fn doc_count(&self) -> usize {
510        self.manifest.docs.len()
511    }
512
513    /// Number of live segments.
514    pub fn segment_count(&self) -> usize {
515        self.manifest.segments.len()
516    }
517
518    /// Number of tombstoned documents awaiting [`condense`](Index::condense).
519    pub fn tombstone_count(&self) -> usize {
520        self.manifest.deleted.len()
521    }
522
523    /// How many of `current` are new / changed / removed relative to the
524    /// manifest — a read-only staleness probe for `index status` that mutates
525    /// nothing (the same diff [`update`](Index::update) would act on).
526    pub fn pending(&self, current: &[FileStat]) -> (usize, usize, usize) {
527        let present: BTreeSet<&str> = current.iter().map(|f| f.key.as_str()).collect();
528        let removed = self
529            .manifest
530            .files
531            .keys()
532            .filter(|k| !present.contains(k.as_str()))
533            .count();
534        let (mut added, mut changed) = (0, 0);
535        for f in current {
536            match self.manifest.files.get(&f.key) {
537                None => added += 1,
538                Some(r) if r.mtime_ns != f.mtime_ns || r.size != f.size => changed += 1,
539                _ => {}
540            }
541        }
542        (added, changed, removed)
543    }
544
545    fn seg_paths(&self, num: u32) -> (PathBuf, PathBuf) {
546        let base = format!("seg-{num:05}");
547        (
548            self.dir.join(format!("{base}.fst")),
549            self.dir.join(format!("{base}.pos")),
550        )
551    }
552
553    fn load_segment(&self, num: u32) -> Result<Segment, String> {
554        let (fst_path, pos_path) = self.seg_paths(num);
555        let fst_bytes =
556            std::fs::read(&fst_path).map_err(|e| format!("{}: {e}", fst_path.display()))?;
557        let pos = std::fs::read(&pos_path).map_err(|e| format!("{}: {e}", pos_path.display()))?;
558        let map = Map::new(fst_bytes).map_err(|e| format!("{}: {e}", fst_path.display()))?;
559        Ok(Segment { map, pos })
560    }
561
562    /// Reconcile the index against `current` (the live files of the OKF roots),
563    /// re-indexing only what changed. `load` is called to read a file's
564    /// [`DocSource`] only when it is new or modified. Writes at most one new
565    /// segment. Call [`save`](Index::save) afterwards to persist the manifest.
566    pub fn update<F>(&mut self, current: &[FileStat], load: F) -> Result<UpdateReport, String>
567    where
568        F: Fn(&FileStat) -> Result<DocSource, String>,
569    {
570        let mut report = UpdateReport::default();
571        let present: BTreeSet<&str> = current.iter().map(|f| f.key.as_str()).collect();
572
573        // Removals: files in the manifest that are gone from the roots.
574        let removed_keys: Vec<String> = self
575            .manifest
576            .files
577            .keys()
578            .filter(|k| !present.contains(k.as_str()))
579            .cloned()
580            .collect();
581        for key in removed_keys {
582            if let Some(rec) = self.manifest.files.remove(&key) {
583                self.manifest.docs.remove(&rec.doc);
584                self.manifest.deleted.insert(rec.doc);
585                report.removed += 1;
586            }
587        }
588
589        // Additions / modifications: collect the docs to put in a new segment.
590        // term -> (doc -> tf)
591        let mut postings: BTreeMap<String, BTreeMap<u64, u32>> = BTreeMap::new();
592        for f in current {
593            let unchanged = self
594                .manifest
595                .files
596                .get(&f.key)
597                .is_some_and(|r| r.mtime_ns == f.mtime_ns && r.size == f.size);
598            if unchanged {
599                continue;
600            }
601            let is_change = self.manifest.files.contains_key(&f.key);
602            // Supersede any prior doc for this key.
603            if let Some(old) = self.manifest.files.get(&f.key) {
604                self.manifest.docs.remove(&old.doc);
605                self.manifest.deleted.insert(old.doc);
606            }
607            let src = load(f)?;
608            let doc = self.manifest.next_doc;
609            self.manifest.next_doc += 1;
610            let terms = tokenize(&format!(
611                "{} {} {} {}",
612                src.title,
613                src.type_,
614                src.tags.join(" "),
615                src.text
616            ));
617            let len = terms.len() as u32;
618            for term in terms {
619                *postings.entry(term).or_default().entry(doc).or_insert(0) += 1;
620            }
621            self.manifest.docs.insert(
622                doc,
623                DocMeta {
624                    key: f.key.clone(),
625                    title: src.title,
626                    type_: src.type_,
627                    tags: src.tags,
628                    len,
629                },
630            );
631            self.manifest.files.insert(
632                f.key.clone(),
633                FileRec {
634                    doc,
635                    mtime_ns: f.mtime_ns,
636                    size: f.size,
637                },
638            );
639            if is_change {
640                report.changed += 1;
641            } else {
642                report.added += 1;
643            }
644        }
645
646        if !postings.is_empty() {
647            let num = self
648                .manifest
649                .segments
650                .iter()
651                .copied()
652                .max()
653                .map(|n| n + 1)
654                .unwrap_or(0);
655            self.write_segment(num, &postings)?;
656            self.manifest.segments.push(num);
657            report.wrote_segment = true;
658        }
659        Ok(report)
660    }
661
662    /// Write a segment from a sorted `term -> (doc -> tf)` map.
663    fn write_segment(
664        &self,
665        num: u32,
666        postings: &BTreeMap<String, BTreeMap<u64, u32>>,
667    ) -> Result<(), String> {
668        std::fs::create_dir_all(&self.dir).map_err(|e| format!("{}: {e}", self.dir.display()))?;
669        let (fst_path, pos_path) = self.seg_paths(num);
670        let mut pos_blob = Vec::new();
671        let wtr = std::io::BufWriter::new(
672            std::fs::File::create(&fst_path).map_err(|e| format!("{}: {e}", fst_path.display()))?,
673        );
674        let mut builder = MapBuilder::new(wtr).map_err(|e| e.to_string())?;
675        // BTreeMap iterates terms in sorted order, as fst requires.
676        for (term, docs) in postings {
677            let list: Vec<(u64, u32)> = docs.iter().map(|(&d, &tf)| (d, tf)).collect();
678            let off = encode_postings(&mut pos_blob, &list);
679            builder
680                .insert(term.as_bytes(), off)
681                .map_err(|e| e.to_string())?;
682        }
683        builder.finish().map_err(|e| e.to_string())?;
684        std::fs::write(&pos_path, &pos_blob).map_err(|e| format!("{}: {e}", pos_path.display()))?;
685        Ok(())
686    }
687
688    /// Search the index, returning up to `limit` hits ranked by a tf-idf score.
689    /// Reads every live segment; tombstoned documents are filtered out.
690    pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>, String> {
691        let terms = parse_query(query);
692        if terms.is_empty() {
693            return Ok(Vec::new());
694        }
695        let segments: Vec<Segment> = self
696            .manifest
697            .segments
698            .iter()
699            .map(|&n| self.load_segment(n))
700            .collect::<Result<_, _>>()?;
701        let n_docs = self.manifest.docs.len().max(1) as f32;
702
703        let deleted = &self.manifest.deleted;
704        let mut scores: HashMap<u64, f32> = HashMap::new();
705        for qt in &terms {
706            // Build the mode's automaton once, then walk every segment with it,
707            // merging each matched dictionary term's postings — so document
708            // frequency (and thus idf) is computed over the whole index.
709            let mut merged: HashMap<String, Vec<(u64, u32)>> = HashMap::new();
710            match qt {
711                QueryTerm::Exact(t) => {
712                    collect_matches(&segments, &Str::new(t), deleted, &mut merged)
713                }
714                QueryTerm::Prefix(t) => {
715                    collect_matches(&segments, &Str::new(t).starts_with(), deleted, &mut merged)
716                }
717                QueryTerm::Fuzzy(t, dist) => {
718                    if let Ok(lev) = Levenshtein::new(t, *dist) {
719                        collect_matches(&segments, &lev, deleted, &mut merged);
720                    }
721                }
722                QueryTerm::Regex(pat) => {
723                    if let Ok(dfa) = DfaAutomaton::new(pat) {
724                        collect_matches(&segments, &dfa, deleted, &mut merged);
725                    }
726                }
727            }
728            for list in merged.values() {
729                let df = list.len().max(1) as f32;
730                let idf = (1.0 + n_docs / df).ln();
731                for &(doc, tf) in list {
732                    *scores.entry(doc).or_insert(0.0) += idf * (1.0 + (tf as f32).ln());
733                }
734            }
735        }
736
737        let mut hits: Vec<SearchHit> = scores
738            .into_iter()
739            .filter_map(|(doc, score)| {
740                self.manifest.docs.get(&doc).map(|m| SearchHit {
741                    key: m.key.clone(),
742                    title: m.title.clone(),
743                    type_: m.type_.clone(),
744                    tags: m.tags.clone(),
745                    score,
746                })
747            })
748            .collect();
749        // Highest score first; ties broken by key for deterministic output.
750        hits.sort_by(|a, b| {
751            b.score
752                .partial_cmp(&a.score)
753                .unwrap_or(std::cmp::Ordering::Equal)
754                .then_with(|| a.key.cmp(&b.key))
755        });
756        hits.truncate(limit);
757        Ok(hits)
758    }
759
760    /// Merge every segment into one, dropping tombstoned documents and their
761    /// postings, then delete the old segment files. Reclaims space without
762    /// re-reading any source file. A no-op when there is nothing to gain
763    /// (≤1 segment and no tombstones).
764    pub fn condense(&mut self) -> Result<bool, String> {
765        if self.manifest.segments.len() <= 1 && self.manifest.deleted.is_empty() {
766            return Ok(false);
767        }
768        let old_segments = self.manifest.segments.clone();
769        let segments: Vec<Segment> = old_segments
770            .iter()
771            .map(|&n| self.load_segment(n))
772            .collect::<Result<_, _>>()?;
773
774        // Rebuild a single term -> (doc -> tf) map from the live postings.
775        let mut postings: BTreeMap<String, BTreeMap<u64, u32>> = BTreeMap::new();
776        for seg in &segments {
777            let mut s = seg.map.stream();
778            while let Some((term_bytes, off)) = s.next() {
779                let Ok(term) = std::str::from_utf8(term_bytes) else {
780                    continue;
781                };
782                for (doc, tf) in decode_postings(&seg.pos, off) {
783                    if self.manifest.deleted.contains(&doc) {
784                        continue;
785                    }
786                    *postings
787                        .entry(term.to_string())
788                        .or_default()
789                        .entry(doc)
790                        .or_insert(0) += tf;
791                }
792            }
793        }
794
795        let num = old_segments.iter().copied().max().unwrap_or(0) + 1;
796        if !postings.is_empty() {
797            self.write_segment(num, &postings)?;
798            self.manifest.segments = vec![num];
799        } else {
800            self.manifest.segments.clear();
801        }
802        self.manifest.deleted.clear();
803        for old in old_segments {
804            let (fst_path, pos_path) = self.seg_paths(old);
805            let _ = std::fs::remove_file(fst_path);
806            let _ = std::fs::remove_file(pos_path);
807        }
808        Ok(true)
809    }
810
811    /// Discard the whole index (segments + manifest state), so the next
812    /// [`update`](Index::update) re-indexes every file from scratch.
813    pub fn reset(&mut self) {
814        for num in std::mem::take(&mut self.manifest.segments) {
815            let (fst_path, pos_path) = self.seg_paths(num);
816            let _ = std::fs::remove_file(fst_path);
817            let _ = std::fs::remove_file(pos_path);
818        }
819        self.manifest = Manifest::default();
820    }
821
822    /// Persist `manifest.json` (creating the index directory if needed).
823    pub fn save(&self) -> Result<(), String> {
824        std::fs::create_dir_all(&self.dir).map_err(|e| format!("{}: {e}", self.dir.display()))?;
825        let path = self.dir.join("manifest.json");
826        let text =
827            serde_json::to_string_pretty(&self.manifest.to_json()).map_err(|e| e.to_string())?;
828        std::fs::write(&path, text).map_err(|e| format!("{}: {e}", path.display()))
829    }
830}
831
832#[cfg(test)]
833mod tests {
834    use super::*;
835    use std::sync::atomic::{AtomicU32, Ordering};
836
837    static TAG: AtomicU32 = AtomicU32::new(0);
838
839    /// A fresh, empty scratch dir for one test (cleared if a prior run left it).
840    fn scratch() -> PathBuf {
841        let n = TAG.fetch_add(1, Ordering::Relaxed);
842        let dir = std::env::temp_dir().join(format!("ct-okfindex-{}-{n}", std::process::id()));
843        let _ = std::fs::remove_dir_all(&dir);
844        std::fs::create_dir_all(&dir).unwrap();
845        dir
846    }
847
848    fn stat(key: &str, mtime_ns: u64) -> FileStat {
849        FileStat {
850            key: key.to_string(),
851            path: PathBuf::from(key),
852            mtime_ns,
853            size: mtime_ns,
854        }
855    }
856
857    fn doc(title: &str, type_: &str, tags: &[&str], text: &str) -> DocSource {
858        DocSource {
859            title: title.to_string(),
860            type_: type_.to_string(),
861            tags: tags.iter().map(|s| s.to_string()).collect(),
862            text: text.to_string(),
863        }
864    }
865
866    #[test]
867    fn tokenize_lowercases_and_splits_on_nonalnum() {
868        assert_eq!(
869            tokenize("Hello, World! foo_bar"),
870            ["hello", "world", "foo", "bar"]
871        );
872        assert_eq!(tokenize("  "), Vec::<String>::new());
873    }
874
875    #[test]
876    fn parse_query_recognizes_all_modes() {
877        let q = parse_query("plain data* typo~ deep~2 /sch.*ma/");
878        assert_eq!(q[0], QueryTerm::Exact("plain".into()));
879        assert_eq!(q[1], QueryTerm::Prefix("data".into()));
880        assert_eq!(q[2], QueryTerm::Fuzzy("typo".into(), 1));
881        assert_eq!(q[3], QueryTerm::Fuzzy("deep".into(), 2));
882        assert_eq!(q[4], QueryTerm::Regex("sch.*ma".into()));
883    }
884
885    #[test]
886    fn varint_roundtrips() {
887        for v in [0u64, 1, 127, 128, 300, 16384, u32::MAX as u64, u64::MAX] {
888            let mut buf = Vec::new();
889            put_uvarint(&mut buf, v);
890            let mut pos = 0;
891            assert_eq!(get_uvarint(&buf, &mut pos), Some(v));
892            assert_eq!(pos, buf.len());
893        }
894    }
895
896    #[test]
897    fn index_search_exact_prefix_and_fuzzy() {
898        let dir = scratch();
899        let mut idx = Index::open(&dir).unwrap();
900        let files = [stat("a.md", 1), stat("b.md", 1)];
901        idx.update(&files, |f| {
902            Ok(match f.key.as_str() {
903                "a.md" => doc(
904                    "Customers",
905                    "BigQuery Table",
906                    &["pii"],
907                    "the customers dimension table",
908                ),
909                _ => doc(
910                    "Orders",
911                    "BigQuery Table",
912                    &["core"],
913                    "the orders fact table schema",
914                ),
915            })
916        })
917        .unwrap();
918        idx.save().unwrap();
919
920        // Exact term hits only the doc that has it.
921        let hits = idx.search("customers", 10).unwrap();
922        assert_eq!(hits.len(), 1);
923        assert_eq!(hits[0].key, "a.md");
924
925        // Prefix matches both ("table" appears in both via type/text).
926        let hits = idx.search("custom*", 10).unwrap();
927        assert_eq!(
928            hits.iter().map(|h| h.key.as_str()).collect::<Vec<_>>(),
929            ["a.md"]
930        );
931
932        // Fuzzy tolerates a typo.
933        let hits = idx.search("ordrs~", 10).unwrap();
934        assert_eq!(hits[0].key, "b.md");
935
936        // Reopen and the persisted index still searches.
937        let idx2 = Index::open(&dir).unwrap();
938        assert_eq!(idx2.doc_count(), 2);
939        assert_eq!(idx2.search("schema", 10).unwrap()[0].key, "b.md");
940    }
941
942    #[test]
943    fn regex_mode_matches_substrings_via_the_dfa_adapter() {
944        let dir = scratch();
945        let mut idx = Index::open(&dir).unwrap();
946        idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
947            Ok(match f.key.as_str() {
948                "a.md" => doc("Orders", "Table", &[], "the orders fact table"),
949                _ => doc("Customers", "Table", &[], "the customers dimension"),
950            })
951        })
952        .unwrap();
953
954        // /.*omer.*/ matches the term "customers" (substring) but not "orders".
955        let hits = idx.search("/.*omer.*/", 10).unwrap();
956        assert_eq!(
957            hits.iter().map(|h| h.key.as_str()).collect::<Vec<_>>(),
958            ["b.md"]
959        );
960
961        // An anchored whole-key pattern still works.
962        let hits = idx.search("/orders/", 10).unwrap();
963        assert_eq!(hits[0].key, "a.md");
964
965        // A pattern matching no term yields nothing (and does not error).
966        assert!(idx.search("/zzz.*/", 10).unwrap().is_empty());
967    }
968
969    #[test]
970    fn incremental_update_supersedes_changed_and_drops_removed() {
971        let dir = scratch();
972        let mut idx = Index::open(&dir).unwrap();
973        idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
974            Ok(if f.key == "a.md" {
975                doc("Alpha", "Note", &[], "alpha content markalpha")
976            } else {
977                doc("Beta", "Note", &[], "beta content markbeta")
978            })
979        })
980        .unwrap();
981
982        // Change a.md (new mtime) and remove b.md.
983        let report = idx
984            .update(&[stat("a.md", 2)], |_| {
985                Ok(doc("Alpha", "Note", &[], "rewritten markgamma"))
986            })
987            .unwrap();
988        assert_eq!((report.changed, report.removed, report.added), (1, 1, 0));
989        assert_eq!(idx.doc_count(), 1);
990        assert_eq!(idx.segment_count(), 2); // a second segment was layered on
991        assert_eq!(idx.tombstone_count(), 2); // old a.md doc + removed b.md
992
993        // The old content is gone; the new content is found; b.md is gone.
994        assert!(idx.search("markalpha", 10).unwrap().is_empty());
995        assert!(idx.search("markbeta", 10).unwrap().is_empty());
996        assert_eq!(idx.search("markgamma", 10).unwrap()[0].key, "a.md");
997
998        // No-op update writes no new segment.
999        let report = idx
1000            .update(&[stat("a.md", 2)], |_| {
1001                panic!("should not load unchanged file")
1002            })
1003            .unwrap();
1004        assert!(report.is_empty());
1005        assert_eq!(idx.segment_count(), 2);
1006    }
1007
1008    #[test]
1009    fn condense_merges_segments_and_drops_tombstones() {
1010        let dir = scratch();
1011        let mut idx = Index::open(&dir).unwrap();
1012        idx.update(&[stat("a.md", 1)], |_| {
1013            Ok(doc("A", "Note", &[], "first markone"))
1014        })
1015        .unwrap();
1016        idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
1017            Ok(if f.key == "b.md" {
1018                doc("B", "Note", &[], "second marktwo")
1019            } else {
1020                doc("A", "Note", &[], "first markone")
1021            })
1022        })
1023        .unwrap();
1024        // Force a tombstone by rewriting a.md.
1025        idx.update(&[stat("a.md", 9), stat("b.md", 1)], |f| {
1026            Ok(if f.key == "a.md" {
1027                doc("A", "Note", &[], "third markthree")
1028            } else {
1029                doc("B", "Note", &[], "second marktwo")
1030            })
1031        })
1032        .unwrap();
1033        assert!(idx.segment_count() >= 2);
1034        assert!(idx.tombstone_count() >= 1);
1035
1036        assert!(idx.condense().unwrap());
1037        assert_eq!(idx.segment_count(), 1);
1038        assert_eq!(idx.tombstone_count(), 0);
1039        assert_eq!(idx.doc_count(), 2);
1040
1041        // Live content still searchable; superseded content gone.
1042        assert_eq!(idx.search("markthree", 10).unwrap()[0].key, "a.md");
1043        assert_eq!(idx.search("marktwo", 10).unwrap()[0].key, "b.md");
1044        assert!(idx.search("markone", 10).unwrap().is_empty());
1045
1046        // Old segment files were removed.
1047        let leftover = std::fs::read_dir(&dir)
1048            .unwrap()
1049            .filter_map(|e| e.ok())
1050            .filter(|e| e.path().extension().is_some_and(|x| x == "fst"))
1051            .count();
1052        assert_eq!(leftover, 1);
1053    }
1054
1055    #[test]
1056    fn pending_counts_added_changed_removed_without_mutating() {
1057        let dir = scratch();
1058        let mut idx = Index::open(&dir).unwrap();
1059        idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
1060            Ok(doc(&f.key, "Note", &[], "content"))
1061        })
1062        .unwrap();
1063        // a unchanged, b changed (new mtime), c new -> (1 added, 1 changed, 0 removed).
1064        assert_eq!(
1065            idx.pending(&[stat("a.md", 1), stat("b.md", 2), stat("c.md", 1)]),
1066            (1, 1, 0)
1067        );
1068        // Omitting b makes it a removal.
1069        assert_eq!(idx.pending(&[stat("a.md", 1)]), (0, 0, 1));
1070        // pending is read-only.
1071        assert_eq!(idx.doc_count(), 2);
1072    }
1073
1074    #[test]
1075    fn reset_clears_then_reindexes_from_scratch() {
1076        let dir = scratch();
1077        let mut idx = Index::open(&dir).unwrap();
1078        idx.update(&[stat("a.md", 1)], |_| {
1079            Ok(doc("A", "Note", &[], "unique markx"))
1080        })
1081        .unwrap();
1082        idx.save().unwrap();
1083        assert_eq!(idx.doc_count(), 1);
1084
1085        idx.reset();
1086        assert_eq!((idx.doc_count(), idx.segment_count()), (0, 0));
1087        assert!(idx.search("markx", 10).unwrap().is_empty());
1088
1089        // Indexing works again after a reset.
1090        idx.update(&[stat("a.md", 1)], |_| {
1091            Ok(doc("A", "Note", &[], "unique marky"))
1092        })
1093        .unwrap();
1094        assert_eq!(idx.search("marky", 10).unwrap()[0].key, "a.md");
1095    }
1096
1097    #[test]
1098    fn search_ranks_more_relevant_documents_higher() {
1099        let dir = scratch();
1100        let mut idx = Index::open(&dir).unwrap();
1101        idx.update(&[stat("hi.md", 1), stat("lo.md", 1)], |f| {
1102            Ok(if f.key == "hi.md" {
1103                doc("Hi", "Note", &[], "alpha alpha alpha beta")
1104            } else {
1105                doc("Lo", "Note", &[], "alpha gamma delta epsilon")
1106            })
1107        })
1108        .unwrap();
1109        let hits = idx.search("alpha", 10).unwrap();
1110        assert_eq!(hits.len(), 2);
1111        assert_eq!(
1112            hits[0].key, "hi.md",
1113            "the higher term frequency should rank first"
1114        );
1115        assert!(hits[0].score > hits[1].score);
1116    }
1117
1118    #[test]
1119    fn empty_query_returns_no_hits() {
1120        let dir = scratch();
1121        let mut idx = Index::open(&dir).unwrap();
1122        idx.update(&[stat("a.md", 1)], |_| Ok(doc("A", "Note", &[], "content")))
1123            .unwrap();
1124        assert!(idx.search("", 10).unwrap().is_empty());
1125        assert!(idx.search("   ", 10).unwrap().is_empty());
1126    }
1127
1128    #[test]
1129    fn search_merges_postings_across_segments() {
1130        let dir = scratch();
1131        let mut idx = Index::open(&dir).unwrap();
1132        // Two updates -> two segments, each holding a doc with the term "shared".
1133        idx.update(&[stat("a.md", 1)], |_| {
1134            Ok(doc("A", "Note", &[], "shared onlya"))
1135        })
1136        .unwrap();
1137        idx.update(&[stat("a.md", 1), stat("b.md", 1)], |f| {
1138            Ok(if f.key == "b.md" {
1139                doc("B", "Note", &[], "shared onlyb")
1140            } else {
1141                doc("A", "Note", &[], "shared onlya")
1142            })
1143        })
1144        .unwrap();
1145        assert!(idx.segment_count() >= 2);
1146        let hits = idx.search("shared", 10).unwrap();
1147        let keys: BTreeSet<&str> = hits.iter().map(|h| h.key.as_str()).collect();
1148        assert!(keys.contains("a.md") && keys.contains("b.md"), "{keys:?}");
1149    }
1150}
coding_tools/okfindex.rs

coding_tools/
okfindex.rs