Skip to main content

sift_core/
lib.rs

//! Fast indexed regex search over codebases — core engine.
//!
//! **Walking:** directory traversal uses [`WalkBuilder`] from the [`ignore`] crate, which applies ripgrep-class ignore rules.
4
5mod index;
6mod planner;
7mod search;
8mod storage;
9mod verify;
10
11pub use index::{Index, IndexBuilder, QueryPlan};
12pub use storage::{lexicon, postings};
13pub use verify::{compile_pattern, compile_search_pattern};
14
15pub use planner::TrigramPlan;
16pub use search::{walk_file_paths, CompiledSearch, Match, SearchMatchFlags, SearchOptions};
17
18pub use ignore::{Walk, WalkBuilder};
19
20pub use index::trigram::extract_trigrams;
21
22use std::path::PathBuf;
23
24use thiserror::Error;
25
/// File name of the index metadata file inside an index directory.
///
/// The tests in this file expect `Index::open` to report `Error::MissingMeta`
/// when this file is absent and `Error::InvalidMeta` when it is empty.
pub const META_FILENAME: &str = "sift.meta";
/// File name of the `files.bin` index component.
///
/// NOTE(review): presumably the serialized list of indexed files — confirm in `storage`.
pub const FILES_BIN: &str = "files.bin";
/// File name of the `lexicon.bin` index component.
///
/// NOTE(review): presumably the on-disk form of the re-exported `lexicon` module's data — confirm.
pub const LEXICON_BIN: &str = "lexicon.bin";
/// File name of the `postings.bin` index component.
///
/// NOTE(review): presumably the on-disk form of the re-exported `postings` module's data — confirm.
pub const POSTINGS_BIN: &str = "postings.bin";
30
31#[derive(Debug, Error)]
32pub enum Error {
33    #[error("IO error: {0}")]
34    Io(#[from] std::io::Error),
35
36    #[error("ignore walk error: {0}")]
37    Ignore(#[from] ignore::Error),
38
39    #[error("regex error: {0}")]
40    Regex(#[from] Box<regex_automata::meta::BuildError>),
41
42    #[error("search patterns must not be empty")]
43    EmptyPatterns,
44
45    #[error("invalid index metadata: {0}")]
46    InvalidMeta(PathBuf),
47
48    #[error("index not initialized (missing {0})")]
49    MissingMeta(PathBuf),
50
51    #[error("index component missing: {0}")]
52    MissingComponent(PathBuf),
53}
54
55pub type Result<T> = std::result::Result<T, Error>;
56
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::PathBuf;

    /// Per-test scratch directory under the system temp dir.
    ///
    /// The directory is created empty on construction and removed again when
    /// the guard is dropped, so test runs do not accumulate leftover trees.
    struct ScratchDir {
        root: PathBuf,
    }

    impl ScratchDir {
        /// Creates `<temp>/<label>-<pid>`, wiping any stale tree with the same name first.
        fn new(label: &str) -> Self {
            let root = std::env::temp_dir().join(format!("{label}-{}", std::process::id()));
            // Best effort: the directory usually does not exist yet.
            let _ = fs::remove_dir_all(&root);
            fs::create_dir_all(&root).unwrap();
            Self { root }
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best effort: a failed cleanup must not turn a passing test into a panic.
            let _ = fs::remove_dir_all(&self.root);
        }
    }

    /// Builds an index over a one-file tree, reopens it, and checks a regex hit's file and line.
    #[test]
    fn build_open_search_finds_line() {
        // Declared first so it is dropped last, after `index` has been released.
        let scratch = ScratchDir::new("sift-core-test");
        let tmp = &scratch.root;
        fs::create_dir_all(tmp.join("src")).unwrap();
        fs::write(tmp.join("src/lib.rs"), "fn hello() {\n  let x = 1;\n}\n").unwrap();

        let idx = tmp.join(".index");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();

        let index = Index::open(&idx).unwrap();
        assert!(!index.lexicon.is_empty());
        let pat = vec![r"let\s+x".to_string()];
        let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
        let hits = q.search_index(&index).unwrap();
        assert_eq!(hits.len(), 1);
        assert!(hits[0].file.ends_with("src/lib.rs"));
        assert_eq!(hits[0].line, 2);
    }

    /// Opening an empty directory must report `Error::MissingMeta`.
    #[test]
    fn open_missing_meta_errors() {
        let scratch = ScratchDir::new("sift-missing-meta");
        let tmp = &scratch.root;
        assert!(matches!(Index::open(tmp), Err(Error::MissingMeta(_))));
    }

    /// A directory with a metadata file but no other files must report `Error::MissingComponent`.
    #[test]
    fn open_missing_table_errors() {
        let scratch = ScratchDir::new("sift-missing-table");
        let tmp = &scratch.root;
        fs::write(tmp.join(META_FILENAME), "/tmp/foo\n").unwrap();
        assert!(matches!(Index::open(tmp), Err(Error::MissingComponent(_))));
    }

    /// An empty metadata file must report `Error::InvalidMeta`.
    #[test]
    fn open_empty_meta_errors() {
        let scratch = ScratchDir::new("sift-empty-meta");
        let tmp = &scratch.root;
        fs::write(tmp.join(META_FILENAME), "").unwrap();
        assert!(matches!(Index::open(tmp), Err(Error::InvalidMeta(_))));
    }

    /// `explain` echoes the pattern and reports the `full_scan` mode for `foo.*`.
    #[test]
    fn explain_returns_naive_plan() {
        let scratch = ScratchDir::new("sift-explain");
        let tmp = &scratch.root;
        let idx = tmp.join(".index");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();
        let plan = index.explain("foo.*");
        assert_eq!(plan.pattern, "foo.*");
        assert_eq!(plan.mode, "full_scan");
    }

    /// A literal search through the index must equal the result of a fresh directory walk.
    #[test]
    fn indexed_search_matches_naive_for_literal() {
        let scratch = ScratchDir::new("sift-idx-parity");
        let tmp = &scratch.root;
        fs::create_dir_all(tmp.join("a")).unwrap();
        fs::create_dir_all(tmp.join("b")).unwrap();
        fs::write(tmp.join("a/x.txt"), "alpha beta\n").unwrap();
        fs::write(tmp.join("b/y.txt"), "gamma delta\n").unwrap();

        let idx = tmp.join(".index");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();

        let pat = vec!["beta".to_string()];
        let opts = SearchOptions::default();
        let q = CompiledSearch::new(&pat, opts).unwrap();
        let naive = q.search_walk(tmp, None).unwrap();
        let indexed = q.search_index(&index).unwrap();
        assert_eq!(indexed, naive);
    }

    /// Every generated file contains `needle`, so an index search must return one hit per file.
    #[test]
    fn full_scan_parallel_candidate_path_finds_all_files() {
        let scratch = ScratchDir::new("sift-parallel-fs");
        let tmp = &scratch.root;
        fs::create_dir_all(tmp.join("d")).unwrap();

        // Size the fixture from the search module's threshold, capped at 64 files
        // so the test stays fast; `usize::MAX` gets a tiny fixed fixture instead.
        let min_parallel = crate::search::parallel_candidate_min_files();
        let n_files = if min_parallel == usize::MAX {
            3
        } else {
            min_parallel.clamp(2, 64)
        };
        for i in 0..n_files {
            fs::write(
                tmp.join("d").join(format!("f{i}.txt")),
                format!("line {i} needle\n"),
            )
            .unwrap();
        }
        let idx = tmp.join(".index");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();
        assert_eq!(index.files.len(), n_files);

        let pat = vec!["needle".to_string()];
        let opts = SearchOptions::default();
        let q = CompiledSearch::new(&pat, opts).unwrap();
        let hits = q.search_index(&index).unwrap();
        assert_eq!(hits.len(), n_files);
    }

    /// With a match-everything pattern, index search and a fresh walk must agree,
    /// in a tree that also contains a directory listed in `.ignore`.
    #[test]
    fn full_scan_uses_files_bin_same_hits_as_fresh_walk() {
        let scratch = ScratchDir::new("sift-fullscan-parity");
        let tmp = &scratch.root;
        fs::create_dir_all(tmp.join("keep")).unwrap();
        fs::write(tmp.join("keep/a.txt"), "one\ntwo beta\n").unwrap();
        fs::write(tmp.join("keep/b.txt"), "three\n").unwrap();
        fs::write(tmp.join(".ignore"), "ignored\n").unwrap();
        fs::create_dir_all(tmp.join("ignored")).unwrap();
        fs::write(tmp.join("ignored/hidden.txt"), "beta skip\n").unwrap();

        let idx = tmp.join(".index");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();

        let pat = vec![".*".to_string()];
        let opts = SearchOptions::default();
        let q = CompiledSearch::new(&pat, opts).unwrap();
        let mut from_index = q.search_index(&index).unwrap();
        let mut from_walk = q.search_walk(tmp, None).unwrap();
        // Sort both sides so the comparison does not depend on traversal order.
        from_index.sort_by(|a, b| (&a.file, a.line, &a.text).cmp(&(&b.file, b.line, &b.text)));
        from_walk.sort_by(|a, b| (&a.file, a.line, &a.text).cmp(&(&b.file, b.line, &b.text)));
        assert_eq!(from_index, from_walk);
    }
}