1mod builder;
4pub mod files;
5pub mod trigram;
6
7use std::cmp::Ordering;
8use std::cmp::Reverse;
9use std::collections::BinaryHeap;
10use std::path::{Path, PathBuf};
11
12pub use builder::build_index_tables;
13use serde::{Deserialize, Serialize};
14
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub struct QueryPlan {
17 pub pattern: String,
18 pub mode: &'static str,
19}
20
21#[derive(Debug)]
26pub struct Index {
27 pub root: PathBuf,
28 pub corpus_kind: CorpusKind,
29 files: files::MappedFilesView,
30 file_paths: Vec<PathBuf>,
31 lexicon: crate::storage::lexicon::MappedLexicon,
32 postings: crate::storage::postings::MappedPostings,
33 pub index_dir: Option<PathBuf>,
34}
35
36#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
37#[serde(tag = "kind", rename_all = "snake_case")]
38pub enum CorpusKind {
39 Directory,
40 File { entries: Vec<PathBuf> },
41}
42
43#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
44pub struct IndexMeta {
45 pub root: PathBuf,
46 #[serde(flatten)]
47 pub kind: CorpusKind,
48}
49
50impl IndexMeta {
51 const fn new(root: PathBuf, kind: CorpusKind) -> Self {
52 Self { root, kind }
53 }
54
55 fn validate(self, meta_path: &Path) -> crate::Result<Self> {
56 if !self.root.is_absolute() {
57 return Err(crate::Error::InvalidMeta(meta_path.to_path_buf()));
58 }
59 match &self.kind {
60 CorpusKind::Directory => {}
61 CorpusKind::File { entries } => {
62 if entries.len() != 1
63 || entries.iter().any(|entry| {
64 entry.as_os_str().is_empty()
65 || entry.is_absolute()
66 || entry.components().count() != 1
67 })
68 {
69 return Err(crate::Error::InvalidMeta(meta_path.to_path_buf()));
70 }
71 }
72 }
73 Ok(self)
74 }
75}
76
77fn validate_file_paths(
78 kind: &CorpusKind,
79 file_paths: &[PathBuf],
80 meta_path: &Path,
81) -> crate::Result<()> {
82 match kind {
83 CorpusKind::Directory => {
84 if file_paths
85 .iter()
86 .any(|path| path.as_os_str().is_empty() || path.is_absolute())
87 {
88 return Err(crate::Error::InvalidMeta(meta_path.to_path_buf()));
89 }
90 }
91 CorpusKind::File { entries } => {
92 if file_paths != entries {
93 return Err(crate::Error::InvalidMeta(meta_path.to_path_buf()));
94 }
95 }
96 }
97 Ok(())
98}
99
100impl Index {
101 pub fn open(path: &Path) -> crate::Result<Self> {
110 let sift_dir = path.to_path_buf();
111 let index_dir = sift_dir.join(crate::INDEX_SUBDIR);
112 let meta_path = sift_dir.join(crate::META_FILENAME);
113 if !meta_path.is_file() {
114 return Err(crate::Error::MissingMeta(meta_path));
115 }
116 let raw = std::fs::read_to_string(&meta_path)?;
117 let meta = serde_json::from_str::<IndexMeta>(&raw)
118 .map_err(|_| crate::Error::InvalidMeta(meta_path.clone()))?
119 .validate(&meta_path)?;
120 let paths = [
121 index_dir.join(crate::FILES_BIN),
122 index_dir.join(crate::LEXICON_BIN),
123 index_dir.join(crate::POSTINGS_BIN),
124 ];
125 for p in &paths {
126 if !p.is_file() {
127 return Err(crate::Error::MissingComponent(p.clone()));
128 }
129 }
130
131 let files = files::MappedFilesView::open(&paths[0]).map_err(crate::Error::Io)?;
132 let file_paths = files.to_path_bufs().map_err(crate::Error::Io)?;
133 validate_file_paths(&meta.kind, &file_paths, &meta_path)?;
134 let lexicon =
135 crate::storage::lexicon::MappedLexicon::open(&paths[1]).map_err(crate::Error::Io)?;
136 let postings =
137 crate::storage::postings::MappedPostings::open(&paths[2]).map_err(crate::Error::Io)?;
138
139 Ok(Self {
140 root: meta.root,
141 corpus_kind: meta.kind,
142 files,
143 file_paths,
144 lexicon,
145 postings,
146 index_dir: Some(sift_dir),
147 })
148 }
149
150 pub fn save_to_dir(&self, dir: &Path) -> crate::Result<()> {
156 std::fs::create_dir_all(dir)?;
157 let meta_path = dir.join(crate::META_FILENAME);
158 let meta = IndexMeta::new(self.root.clone(), self.corpus_kind.clone());
159 std::fs::write(
160 &meta_path,
161 serde_json::to_vec_pretty(&meta)
162 .map_err(|_| crate::Error::InvalidMeta(meta_path.clone()))?,
163 )?;
164
165 let index_dir = dir.join(crate::INDEX_SUBDIR);
166 std::fs::create_dir_all(&index_dir)?;
167 std::fs::write(index_dir.join(crate::FILES_BIN), self.files.backing_slice())
168 .map_err(crate::Error::Io)?;
169 std::fs::write(
170 index_dir.join(crate::LEXICON_BIN),
171 self.lexicon.backing_slice(),
172 )
173 .map_err(crate::Error::Io)?;
174 std::fs::write(
175 index_dir.join(crate::POSTINGS_BIN),
176 self.postings.backing_slice(),
177 )
178 .map_err(crate::Error::Io)?;
179 Ok(())
180 }
181
182 #[must_use]
183 pub fn index_dir(&self) -> Option<&Path> {
184 self.index_dir.as_deref()
185 }
186
187 #[must_use]
188 pub fn explain(&self, pattern: &str) -> QueryPlan {
189 let mode = match crate::planner::TrigramPlan::for_patterns(
190 &[pattern.to_string()],
191 &crate::SearchOptions::default(),
192 ) {
193 crate::planner::TrigramPlan::FullScan => "full_scan",
194 crate::planner::TrigramPlan::Narrow { .. } => "indexed_candidates",
195 };
196 QueryPlan {
197 pattern: pattern.to_string(),
198 mode,
199 }
200 }
201
202 #[must_use]
203 pub fn posting_bytes_slice(&self, tri: [u8; 3]) -> &[u8] {
204 let Some(entry) = self.lexicon.get(tri) else {
205 return &[];
206 };
207 let start = usize::try_from(entry.offset).unwrap_or(usize::MAX);
208 let n = usize::try_from(entry.len).unwrap_or(usize::MAX);
209 let nbytes = n.saturating_mul(4);
210 self.postings.slice(start, nbytes)
211 }
212
213 #[must_use]
219 pub fn posting_list_for_trigram(&self, tri: [u8; 3]) -> Vec<u32> {
220 let slice = self.posting_bytes_slice(tri);
221 if !slice.len().is_multiple_of(4) {
222 return Vec::new();
223 }
224 slice
225 .chunks_exact(4)
226 .map(|c| u32::from_le_bytes(c.try_into().unwrap()))
227 .collect()
228 }
229
230 #[must_use]
231 pub fn candidate_file_ids(&self, arms: &[crate::planner::Arm]) -> Vec<u32> {
232 let mut id_lists: Vec<Vec<u32>> = Vec::with_capacity(arms.len());
233 for arm in arms {
234 if arm.is_empty() {
235 continue;
236 }
237 let mut slices: Vec<&[u8]> = arm
238 .iter()
239 .map(|tri| self.posting_bytes_slice(*tri))
240 .collect();
241 if slices.iter().any(|s| s.is_empty()) {
242 continue;
243 }
244 slices.sort_unstable_by_key(|slice| slice.len());
245 let ids = intersect_sorted_posting_byte_slices(&slices);
246 if !ids.is_empty() {
247 id_lists.push(ids);
248 }
249 }
250 merge_sorted_runs(id_lists)
251 }
252
253 #[must_use]
254 pub fn candidate_paths(&self, arms: &[crate::planner::Arm]) -> Vec<PathBuf> {
255 self.candidate_file_ids(arms)
256 .iter()
257 .filter_map(|&id| self.file_paths.get(id as usize).cloned())
258 .collect()
259 }
260
261 #[must_use]
262 pub fn file_path(&self, id: usize) -> Option<&Path> {
263 self.file_paths.get(id).map(PathBuf::as_path)
264 }
265
266 #[must_use]
267 pub const fn file_count(&self) -> usize {
268 self.files.len()
269 }
270
271 pub fn iter_files(&self) -> impl Iterator<Item = &Path> {
272 self.file_paths.iter().map(PathBuf::as_path)
273 }
274}
275
276fn intersect_sorted_posting_byte_slices(slices: &[&[u8]]) -> Vec<u32> {
277 if slices.is_empty() {
278 return Vec::new();
279 }
280 if slices.len() == 1 {
281 return u32_vec_from_le_bytes(slices[0]);
282 }
283 let mut cur = intersect_two_posting_bytes(slices[0], slices[1]);
284 for s in &slices[2..] {
285 cur = intersect_vec_with_posting_bytes(&cur, s);
286 if cur.is_empty() {
287 break;
288 }
289 }
290 cur
291}
292
293fn intersect_two_posting_bytes(a: &[u8], b: &[u8]) -> Vec<u32> {
294 if !a.len().is_multiple_of(4) || !b.len().is_multiple_of(4) {
295 return Vec::new();
296 }
297 let an = a.len() / 4;
298 let bn = b.len() / 4;
299 let mut i = 0usize;
300 let mut j = 0usize;
301 let mut out = Vec::new();
302 while i < an && j < bn {
303 let ai = u32::from_le_bytes(a[i * 4..i * 4 + 4].try_into().unwrap());
304 let bj = u32::from_le_bytes(b[j * 4..j * 4 + 4].try_into().unwrap());
305 match ai.cmp(&bj) {
306 Ordering::Less => i += 1,
307 Ordering::Greater => j += 1,
308 Ordering::Equal => {
309 out.push(ai);
310 i += 1;
311 j += 1;
312 }
313 }
314 }
315 out
316}
317
318fn intersect_vec_with_posting_bytes(cur: &[u32], b: &[u8]) -> Vec<u32> {
319 if !b.len().is_multiple_of(4) {
320 return Vec::new();
321 }
322 let bn = b.len() / 4;
323 let mut i = 0usize;
324 let mut j = 0usize;
325 let mut out = Vec::new();
326 while i < cur.len() && j < bn {
327 let bj = u32::from_le_bytes(b[j * 4..j * 4 + 4].try_into().unwrap());
328 match cur[i].cmp(&bj) {
329 Ordering::Less => i += 1,
330 Ordering::Greater => j += 1,
331 Ordering::Equal => {
332 out.push(cur[i]);
333 i += 1;
334 j += 1;
335 }
336 }
337 }
338 out
339}
340
341fn u32_vec_from_le_bytes(slice: &[u8]) -> Vec<u32> {
342 if !slice.len().is_multiple_of(4) {
343 return Vec::new();
344 }
345 slice
346 .chunks_exact(4)
347 .map(|c| u32::from_le_bytes(c.try_into().unwrap()))
348 .collect()
349}
350
351fn merge_sorted_runs(lists: Vec<Vec<u32>>) -> Vec<u32> {
352 if lists.is_empty() {
353 return Vec::new();
354 }
355 if lists.len() == 1 {
356 return lists.into_iter().next().unwrap_or_default();
357 }
358
359 let total: usize = lists.iter().map(Vec::len).sum();
360 let mut heap: BinaryHeap<Reverse<(u32, usize)>> = BinaryHeap::with_capacity(lists.len());
361 let mut positions = vec![0usize; lists.len()];
362
363 for (list_idx, list) in lists.iter().enumerate() {
364 if let Some(&first) = list.first() {
365 heap.push(Reverse((first, list_idx)));
366 }
367 }
368
369 let mut out = Vec::with_capacity(total);
370 let mut last = None;
371 while let Some(Reverse((value, list_idx))) = heap.pop() {
372 if last != Some(value) {
373 out.push(value);
374 last = Some(value);
375 }
376
377 positions[list_idx] += 1;
378 if let Some(&next) = lists[list_idx].get(positions[list_idx]) {
379 heap.push(Reverse((next, list_idx)));
380 }
381 }
382 out
383}
384
385pub struct IndexBuilder<'a> {
386 root: &'a Path,
387 dir: Option<PathBuf>,
388}
389
390impl<'a> IndexBuilder<'a> {
391 #[must_use]
392 pub const fn new(root: &'a Path) -> Self {
393 Self { root, dir: None }
394 }
395
396 #[must_use]
397 pub fn with_dir(mut self, dir: impl Into<PathBuf>) -> Self {
398 self.dir = Some(dir.into());
399 self
400 }
401
402 pub fn build(self) -> crate::Result<Index> {
409 let canonical = self.root.canonicalize()?;
410 let (root, build_root) = if canonical.is_file() {
411 let parent = canonical.parent().ok_or_else(|| {
412 crate::Error::Io(std::io::Error::new(
413 std::io::ErrorKind::InvalidInput,
414 "single-file corpus must have a parent directory",
415 ))
416 })?;
417 (parent.to_path_buf(), canonical)
418 } else {
419 (canonical.clone(), canonical)
420 };
421 let (corpus_kind, tables) = build_index_tables(&build_root)?;
422
423 let files = files::MappedFilesView::from_paths(&tables.files);
424 let lexicon = crate::storage::lexicon::MappedLexicon::from_entries(&tables.lexicon);
425 let postings = crate::storage::postings::MappedPostings::from_bytes(&tables.postings);
426
427 let mut index = Index {
428 root,
429 corpus_kind,
430 files,
431 file_paths: tables.files,
432 lexicon,
433 postings,
434 index_dir: None,
435 };
436
437 if let Some(dir) = self.dir {
438 index.index_dir = Some(dir.clone());
439 index.save_to_dir(&dir)?;
440 }
441 Ok(index)
442 }
443}
444
445#[cfg(test)]
446mod tests {
447 use super::{intersect_sorted_posting_byte_slices, merge_sorted_runs};
448
449 fn bytes(ids: &[u32]) -> Vec<u8> {
450 ids.iter().flat_map(|id| id.to_le_bytes()).collect()
451 }
452
453 #[test]
454 fn merge_sorted_runs_preserves_order_and_uniqueness() {
455 let merged = merge_sorted_runs(vec![vec![1, 3, 7], vec![1, 2, 7, 9], vec![4, 7, 8]]);
456 assert_eq!(merged, vec![1, 2, 3, 4, 7, 8, 9]);
457 }
458
459 #[test]
460 fn intersect_sorted_posting_byte_slices_handles_smallest_first_order() {
461 let a = bytes(&[1, 3, 5, 7, 9]);
462 let b = bytes(&[3, 7]);
463 let c = bytes(&[0, 3, 4, 7, 8]);
464 let slices = vec![a.as_slice(), b.as_slice(), c.as_slice()];
465 let ids = intersect_sorted_posting_byte_slices(&slices);
466 assert_eq!(ids, vec![3, 7]);
467 }
468}