Skip to main content

sift_core/index/
mod.rs

1//! Trigram index: build, load, search.
2
3mod builder;
4pub mod files;
5pub mod trigram;
6
7use std::cmp::Ordering;
8use std::cmp::Reverse;
9use std::collections::BinaryHeap;
10use std::path::{Path, PathBuf};
11
12pub use builder::build_index_tables;
13use serde::{Deserialize, Serialize};
14
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub struct QueryPlan {
17    pub pattern: String,
18    pub mode: &'static str,
19}
20
21/// In-memory trigram index backed by memory-mapped storage.
22///
23/// All data is accessed zero-copy from mapped files. Opening an index is cheap
24/// — just memory-mapping the three index files, no deserialization.
25#[derive(Debug)]
26pub struct Index {
27    pub root: PathBuf,
28    pub corpus_kind: CorpusKind,
29    files: files::MappedFilesView,
30    file_paths: Vec<PathBuf>,
31    lexicon: crate::storage::lexicon::MappedLexicon,
32    postings: crate::storage::postings::MappedPostings,
33    pub index_dir: Option<PathBuf>,
34}
35
36#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
37#[serde(tag = "kind", rename_all = "snake_case")]
38pub enum CorpusKind {
39    Directory,
40    File { entries: Vec<PathBuf> },
41}
42
43#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44pub struct IndexMeta {
45    pub root: PathBuf,
46    #[serde(flatten)]
47    pub kind: CorpusKind,
48}
49
50impl IndexMeta {
51    const fn new(root: PathBuf, kind: CorpusKind) -> Self {
52        Self { root, kind }
53    }
54
55    fn validate(self, meta_path: &Path) -> crate::Result<Self> {
56        if !self.root.is_absolute() {
57            return Err(crate::Error::InvalidMeta(meta_path.to_path_buf()));
58        }
59        match &self.kind {
60            CorpusKind::Directory => {}
61            CorpusKind::File { entries } => {
62                if entries.len() != 1
63                    || entries.iter().any(|entry| {
64                        entry.as_os_str().is_empty()
65                            || entry.is_absolute()
66                            || entry.components().count() != 1
67                    })
68                {
69                    return Err(crate::Error::InvalidMeta(meta_path.to_path_buf()));
70                }
71            }
72        }
73        Ok(self)
74    }
75}
76
77fn validate_file_paths(
78    kind: &CorpusKind,
79    file_paths: &[PathBuf],
80    meta_path: &Path,
81) -> crate::Result<()> {
82    match kind {
83        CorpusKind::Directory => {
84            if file_paths
85                .iter()
86                .any(|path| path.as_os_str().is_empty() || path.is_absolute())
87            {
88                return Err(crate::Error::InvalidMeta(meta_path.to_path_buf()));
89            }
90        }
91        CorpusKind::File { entries } => {
92            if file_paths != entries {
93                return Err(crate::Error::InvalidMeta(meta_path.to_path_buf()));
94            }
95        }
96    }
97    Ok(())
98}
99
100impl Index {
101    /// Open an index directory produced by [`IndexBuilder::build`].
102    ///
103    /// # Errors
104    ///
105    /// Returns [`crate::Error::MissingMeta`] if `sift.meta` is absent,
106    /// [`crate::Error::InvalidMeta`] if metadata is empty or malformed,
107    /// [`crate::Error::MissingComponent`] if a trigram table file is missing,
108    /// or [`crate::Error::Io`] on read/mmap failure.
109    pub fn open(path: &Path) -> crate::Result<Self> {
110        let sift_dir = path.to_path_buf();
111        let index_dir = sift_dir.join(crate::INDEX_SUBDIR);
112        let meta_path = sift_dir.join(crate::META_FILENAME);
113        if !meta_path.is_file() {
114            return Err(crate::Error::MissingMeta(meta_path));
115        }
116        let raw = std::fs::read_to_string(&meta_path)?;
117        let meta = serde_json::from_str::<IndexMeta>(&raw)
118            .map_err(|_| crate::Error::InvalidMeta(meta_path.clone()))?
119            .validate(&meta_path)?;
120        let paths = [
121            index_dir.join(crate::FILES_BIN),
122            index_dir.join(crate::LEXICON_BIN),
123            index_dir.join(crate::POSTINGS_BIN),
124        ];
125        for p in &paths {
126            if !p.is_file() {
127                return Err(crate::Error::MissingComponent(p.clone()));
128            }
129        }
130
131        let files = files::MappedFilesView::open(&paths[0]).map_err(crate::Error::Io)?;
132        let file_paths = files.to_path_bufs().map_err(crate::Error::Io)?;
133        validate_file_paths(&meta.kind, &file_paths, &meta_path)?;
134        let lexicon =
135            crate::storage::lexicon::MappedLexicon::open(&paths[1]).map_err(crate::Error::Io)?;
136        let postings =
137            crate::storage::postings::MappedPostings::open(&paths[2]).map_err(crate::Error::Io)?;
138
139        Ok(Self {
140            root: meta.root,
141            corpus_kind: meta.kind,
142            files,
143            file_paths,
144            lexicon,
145            postings,
146            index_dir: Some(sift_dir),
147        })
148    }
149
150    /// Persist the in-memory index to `dir`.
151    ///
152    /// # Errors
153    ///
154    /// Propagates IO errors from creating directories or writing files.
155    pub fn save_to_dir(&self, dir: &Path) -> crate::Result<()> {
156        std::fs::create_dir_all(dir)?;
157        let meta_path = dir.join(crate::META_FILENAME);
158        let meta = IndexMeta::new(self.root.clone(), self.corpus_kind.clone());
159        std::fs::write(
160            &meta_path,
161            serde_json::to_vec_pretty(&meta)
162                .map_err(|_| crate::Error::InvalidMeta(meta_path.clone()))?,
163        )?;
164
165        let index_dir = dir.join(crate::INDEX_SUBDIR);
166        std::fs::create_dir_all(&index_dir)?;
167        std::fs::write(index_dir.join(crate::FILES_BIN), self.files.backing_slice())
168            .map_err(crate::Error::Io)?;
169        std::fs::write(
170            index_dir.join(crate::LEXICON_BIN),
171            self.lexicon.backing_slice(),
172        )
173        .map_err(crate::Error::Io)?;
174        std::fs::write(
175            index_dir.join(crate::POSTINGS_BIN),
176            self.postings.backing_slice(),
177        )
178        .map_err(crate::Error::Io)?;
179        Ok(())
180    }
181
182    #[must_use]
183    pub fn index_dir(&self) -> Option<&Path> {
184        self.index_dir.as_deref()
185    }
186
187    #[must_use]
188    pub fn explain(&self, pattern: &str) -> QueryPlan {
189        let mode = match crate::planner::TrigramPlan::for_patterns(
190            &[pattern.to_string()],
191            &crate::SearchOptions::default(),
192        ) {
193            crate::planner::TrigramPlan::FullScan => "full_scan",
194            crate::planner::TrigramPlan::Narrow { .. } => "indexed_candidates",
195        };
196        QueryPlan {
197            pattern: pattern.to_string(),
198            mode,
199        }
200    }
201
202    #[must_use]
203    pub fn posting_bytes_slice(&self, tri: [u8; 3]) -> &[u8] {
204        let Some(entry) = self.lexicon.get(tri) else {
205            return &[];
206        };
207        let start = usize::try_from(entry.offset).unwrap_or(usize::MAX);
208        let n = usize::try_from(entry.len).unwrap_or(usize::MAX);
209        let nbytes = n.saturating_mul(4);
210        self.postings.slice(start, nbytes)
211    }
212
213    /// Get sorted file IDs for a trigram. Materializes from mapped bytes.
214    ///
215    /// # Panics
216    ///
217    /// Panics if postings data for this trigram is corrupted.
218    #[must_use]
219    pub fn posting_list_for_trigram(&self, tri: [u8; 3]) -> Vec<u32> {
220        let slice = self.posting_bytes_slice(tri);
221        if !slice.len().is_multiple_of(4) {
222            return Vec::new();
223        }
224        slice
225            .chunks_exact(4)
226            .map(|c| u32::from_le_bytes(c.try_into().unwrap()))
227            .collect()
228    }
229
230    #[must_use]
231    pub fn candidate_file_ids(&self, arms: &[crate::planner::Arm]) -> Vec<u32> {
232        let mut id_lists: Vec<Vec<u32>> = Vec::with_capacity(arms.len());
233        for arm in arms {
234            if arm.is_empty() {
235                continue;
236            }
237            let mut slices: Vec<&[u8]> = arm
238                .iter()
239                .map(|tri| self.posting_bytes_slice(*tri))
240                .collect();
241            if slices.iter().any(|s| s.is_empty()) {
242                continue;
243            }
244            slices.sort_unstable_by_key(|slice| slice.len());
245            let ids = intersect_sorted_posting_byte_slices(&slices);
246            if !ids.is_empty() {
247                id_lists.push(ids);
248            }
249        }
250        merge_sorted_runs(id_lists)
251    }
252
253    #[must_use]
254    pub fn candidate_paths(&self, arms: &[crate::planner::Arm]) -> Vec<PathBuf> {
255        self.candidate_file_ids(arms)
256            .iter()
257            .filter_map(|&id| self.file_paths.get(id as usize).cloned())
258            .collect()
259    }
260
261    #[must_use]
262    pub fn file_path(&self, id: usize) -> Option<&Path> {
263        self.file_paths.get(id).map(PathBuf::as_path)
264    }
265
266    #[must_use]
267    pub const fn file_count(&self) -> usize {
268        self.files.len()
269    }
270
271    pub fn iter_files(&self) -> impl Iterator<Item = &Path> {
272        self.file_paths.iter().map(PathBuf::as_path)
273    }
274}
275
276fn intersect_sorted_posting_byte_slices(slices: &[&[u8]]) -> Vec<u32> {
277    if slices.is_empty() {
278        return Vec::new();
279    }
280    if slices.len() == 1 {
281        return u32_vec_from_le_bytes(slices[0]);
282    }
283    let mut cur = intersect_two_posting_bytes(slices[0], slices[1]);
284    for s in &slices[2..] {
285        cur = intersect_vec_with_posting_bytes(&cur, s);
286        if cur.is_empty() {
287            break;
288        }
289    }
290    cur
291}
292
293fn intersect_two_posting_bytes(a: &[u8], b: &[u8]) -> Vec<u32> {
294    if !a.len().is_multiple_of(4) || !b.len().is_multiple_of(4) {
295        return Vec::new();
296    }
297    let an = a.len() / 4;
298    let bn = b.len() / 4;
299    let mut i = 0usize;
300    let mut j = 0usize;
301    let mut out = Vec::new();
302    while i < an && j < bn {
303        let ai = u32::from_le_bytes(a[i * 4..i * 4 + 4].try_into().unwrap());
304        let bj = u32::from_le_bytes(b[j * 4..j * 4 + 4].try_into().unwrap());
305        match ai.cmp(&bj) {
306            Ordering::Less => i += 1,
307            Ordering::Greater => j += 1,
308            Ordering::Equal => {
309                out.push(ai);
310                i += 1;
311                j += 1;
312            }
313        }
314    }
315    out
316}
317
318fn intersect_vec_with_posting_bytes(cur: &[u32], b: &[u8]) -> Vec<u32> {
319    if !b.len().is_multiple_of(4) {
320        return Vec::new();
321    }
322    let bn = b.len() / 4;
323    let mut i = 0usize;
324    let mut j = 0usize;
325    let mut out = Vec::new();
326    while i < cur.len() && j < bn {
327        let bj = u32::from_le_bytes(b[j * 4..j * 4 + 4].try_into().unwrap());
328        match cur[i].cmp(&bj) {
329            Ordering::Less => i += 1,
330            Ordering::Greater => j += 1,
331            Ordering::Equal => {
332                out.push(cur[i]);
333                i += 1;
334                j += 1;
335            }
336        }
337    }
338    out
339}
340
341fn u32_vec_from_le_bytes(slice: &[u8]) -> Vec<u32> {
342    if !slice.len().is_multiple_of(4) {
343        return Vec::new();
344    }
345    slice
346        .chunks_exact(4)
347        .map(|c| u32::from_le_bytes(c.try_into().unwrap()))
348        .collect()
349}
350
351fn merge_sorted_runs(lists: Vec<Vec<u32>>) -> Vec<u32> {
352    if lists.is_empty() {
353        return Vec::new();
354    }
355    if lists.len() == 1 {
356        return lists.into_iter().next().unwrap_or_default();
357    }
358
359    let total: usize = lists.iter().map(Vec::len).sum();
360    let mut heap: BinaryHeap<Reverse<(u32, usize)>> = BinaryHeap::with_capacity(lists.len());
361    let mut positions = vec![0usize; lists.len()];
362
363    for (list_idx, list) in lists.iter().enumerate() {
364        if let Some(&first) = list.first() {
365            heap.push(Reverse((first, list_idx)));
366        }
367    }
368
369    let mut out = Vec::with_capacity(total);
370    let mut last = None;
371    while let Some(Reverse((value, list_idx))) = heap.pop() {
372        if last != Some(value) {
373            out.push(value);
374            last = Some(value);
375        }
376
377        positions[list_idx] += 1;
378        if let Some(&next) = lists[list_idx].get(positions[list_idx]) {
379            heap.push(Reverse((next, list_idx)));
380        }
381    }
382    out
383}
384
385pub struct IndexBuilder<'a> {
386    root: &'a Path,
387    dir: Option<PathBuf>,
388}
389
390impl<'a> IndexBuilder<'a> {
391    #[must_use]
392    pub const fn new(root: &'a Path) -> Self {
393        Self { root, dir: None }
394    }
395
396    #[must_use]
397    pub fn with_dir(mut self, dir: impl Into<PathBuf>) -> Self {
398        self.dir = Some(dir.into());
399        self
400    }
401
402    /// Walk `root`, extract trigrams, and return an in-memory [`Index`].
403    ///
404    /// # Errors
405    ///
406    /// Propagates IO errors from walking, reading files, or writing persistence files
407    /// (if `with_dir` was called).
408    pub fn build(self) -> crate::Result<Index> {
409        let canonical = self.root.canonicalize()?;
410        let (root, build_root) = if canonical.is_file() {
411            let parent = canonical.parent().ok_or_else(|| {
412                crate::Error::Io(std::io::Error::new(
413                    std::io::ErrorKind::InvalidInput,
414                    "single-file corpus must have a parent directory",
415                ))
416            })?;
417            (parent.to_path_buf(), canonical)
418        } else {
419            (canonical.clone(), canonical)
420        };
421        let (corpus_kind, tables) = build_index_tables(&build_root)?;
422
423        let files = files::MappedFilesView::from_paths(&tables.files);
424        let lexicon = crate::storage::lexicon::MappedLexicon::from_entries(&tables.lexicon);
425        let postings = crate::storage::postings::MappedPostings::from_bytes(&tables.postings);
426
427        let mut index = Index {
428            root,
429            corpus_kind,
430            files,
431            file_paths: tables.files,
432            lexicon,
433            postings,
434            index_dir: None,
435        };
436
437        if let Some(dir) = self.dir {
438            index.index_dir = Some(dir.clone());
439            index.save_to_dir(&dir)?;
440        }
441        Ok(index)
442    }
443}
444
445#[cfg(test)]
446mod tests {
447    use super::{intersect_sorted_posting_byte_slices, merge_sorted_runs};
448
449    fn bytes(ids: &[u32]) -> Vec<u8> {
450        ids.iter().flat_map(|id| id.to_le_bytes()).collect()
451    }
452
453    #[test]
454    fn merge_sorted_runs_preserves_order_and_uniqueness() {
455        let merged = merge_sorted_runs(vec![vec![1, 3, 7], vec![1, 2, 7, 9], vec![4, 7, 8]]);
456        assert_eq!(merged, vec![1, 2, 3, 4, 7, 8, 9]);
457    }
458
459    #[test]
460    fn intersect_sorted_posting_byte_slices_handles_smallest_first_order() {
461        let a = bytes(&[1, 3, 5, 7, 9]);
462        let b = bytes(&[3, 7]);
463        let c = bytes(&[0, 3, 4, 7, 8]);
464        let slices = vec![a.as_slice(), b.as_slice(), c.as_slice()];
465        let ids = intersect_sorted_posting_byte_slices(&slices);
466        assert_eq!(ids, vec![3, 7]);
467    }
468}