Skip to main content

sift_core/
lib.rs

1//! Fast indexed regex search over codebases — core engine.
2//!
3//! **Walking:** [`WalkBuilder`] from the [`ignore`] crate (ripgrep-class ignore rules).
4
5mod index;
6mod planner;
7mod search;
8pub mod storage;
9mod verify;
10
11pub use index::{Index, IndexBuilder, QueryPlan};
12pub use storage::{lexicon, postings};
13pub use verify::{compile_pattern, compile_search_pattern};
14
15pub use planner::TrigramPlan;
16pub use search::{
17    walk_file_paths, CompiledSearch, Match, SearchMatchFlags, SearchMode, SearchOptions,
18    SearchOutput,
19};
20
21pub use ignore::{Walk, WalkBuilder};
22
23pub use index::trigram::extract_trigrams;
24
25use std::path::PathBuf;
26
27use thiserror::Error;
28
/// Directory name `.sift`. The tests in this file keep their on-disk index in a directory
/// of this name (NOTE(review): presumably the default index location — confirm with callers).
pub const SIFT_DIR: &str = ".sift";
/// Sub-directory name `.index` (NOTE(review): presumably nested under [`SIFT_DIR`] — confirm).
pub const INDEX_SUBDIR: &str = ".index";
/// File name of the index metadata file inside an index directory. The tests in this file
/// write it as JSON and expect `Index::open` to fail when it is missing or empty.
pub const META_FILENAME: &str = "sift.meta";
/// File name `files.bin` (NOTE(review): presumably the indexed file table — confirm).
pub const FILES_BIN: &str = "files.bin";
/// File name `lexicon.bin` (NOTE(review): presumably the serialized lexicon — confirm).
pub const LEXICON_BIN: &str = "lexicon.bin";
/// File name `postings.bin` (NOTE(review): presumably the serialized postings — confirm).
pub const POSTINGS_BIN: &str = "postings.bin";
35
/// Error type shared by this crate's fallible operations (see [`Result`]).
///
/// The `Display` text of every variant and the `From` conversions marked `#[from]` are
/// generated by the `thiserror` derive.
#[derive(Debug, Error)]
pub enum Error {
    /// Wraps a [`std::io::Error`]; `#[from]` lets `?` convert it automatically.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Wraps an error reported by the `ignore` crate; converted automatically via `#[from]`.
    #[error("ignore walk error: {0}")]
    Ignore(#[from] ignore::Error),

    /// Wraps a boxed `regex_automata` meta-regex build error; converted via `#[from]`.
    #[error("regex error: {0}")]
    Regex(#[from] Box<regex_automata::meta::BuildError>),

    /// A regex build failure carried as a plain message.
    /// NOTE(review): the sites that produce this variant are not visible in this file.
    #[error("regex build error: {0}")]
    RegexBuild(String),

    /// The supplied list of search patterns was empty.
    #[error("search patterns must not be empty")]
    EmptyPatterns,

    /// Index metadata could not be used; the tests in this file trigger it with an empty
    /// metadata file. The payload is a path (presumably the offending file — confirm).
    #[error("invalid index metadata: {0}")]
    InvalidMeta(PathBuf),

    /// The index directory has no metadata file; the payload is rendered as the missing path.
    #[error("index not initialized (missing {0})")]
    MissingMeta(PathBuf),

    /// Metadata is present but another index component is absent; the tests in this file
    /// trigger it by opening a directory that contains only the metadata file.
    #[error("index component missing: {0}")]
    MissingComponent(PathBuf),
}
62
/// Crate-wide result alias: [`std::result::Result`] with the error type fixed to [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
64
65#[cfg(test)]
66mod tests {
67    use super::*;
68    use std::fs;
69
70    fn normalized_path(p: &std::path::Path) -> std::path::PathBuf {
71        let s = p.display().to_string();
72        #[cfg(windows)]
73        let s = s.strip_prefix("\\\\?\\").unwrap_or(&s).to_string();
74        #[cfg(target_os = "macos")]
75        let s = s.replace("/private", "");
76        std::path::PathBuf::from(s)
77    }
78
79    #[test]
80    fn build_open_search_finds_line() {
81        let tmp = std::env::temp_dir().join(format!("sift-core-test-{}", std::process::id()));
82        let _ = fs::remove_dir_all(&tmp);
83        fs::create_dir_all(tmp.join("src")).unwrap();
84        fs::write(tmp.join("src/lib.rs"), "fn hello() {\n  let x = 1;\n}\n").unwrap();
85
86        let idx = tmp.join(".sift");
87        let _ = IndexBuilder::new(&tmp).with_dir(&idx).build().unwrap();
88
89        let index = Index::open(&idx).unwrap();
90        assert!(index.file_count() > 0);
91        let pat = vec![r"let\s+x".to_string()];
92        let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
93        let hits = q.collect_index_matches(&index).unwrap();
94        assert_eq!(hits.len(), 1);
95        assert!(hits[0].file.ends_with("src/lib.rs"));
96        assert_eq!(hits[0].line, 2);
97    }
98
99    #[test]
100    fn open_missing_meta_errors() {
101        let tmp = std::env::temp_dir().join(format!("sift-missing-meta-{}", std::process::id()));
102        let _ = fs::remove_dir_all(&tmp);
103        fs::create_dir_all(&tmp).unwrap();
104        assert!(matches!(Index::open(&tmp), Err(Error::MissingMeta(_))));
105    }
106
107    #[test]
108    fn open_missing_table_errors() {
109        let tmp = std::env::temp_dir().join(format!("sift-missing-table-{}", std::process::id()));
110        let _ = fs::remove_dir_all(&tmp);
111        fs::create_dir_all(&tmp).unwrap();
112        let root_path = std::env::temp_dir().join("sift-test-root");
113        let meta = crate::index::IndexMeta {
114            root: root_path,
115            kind: crate::index::CorpusKind::Directory,
116        };
117        fs::write(
118            tmp.join(META_FILENAME),
119            serde_json::to_string_pretty(&meta).unwrap(),
120        )
121        .unwrap();
122        assert!(matches!(Index::open(&tmp), Err(Error::MissingComponent(_))));
123    }
124
125    #[test]
126    fn open_empty_meta_errors() {
127        let tmp = std::env::temp_dir().join(format!("sift-empty-meta-{}", std::process::id()));
128        let _ = fs::remove_dir_all(&tmp);
129        fs::create_dir_all(&tmp).unwrap();
130        fs::write(tmp.join(META_FILENAME), "").unwrap();
131        assert!(matches!(Index::open(&tmp), Err(Error::InvalidMeta(_))));
132    }
133
134    #[test]
135    fn explain_returns_indexed_plan_for_literal_prefix() {
136        let tmp = std::env::temp_dir().join(format!("sift-explain-indexed-{}", std::process::id()));
137        let _ = fs::remove_dir_all(&tmp);
138        fs::create_dir_all(&tmp).unwrap();
139        fs::write(tmp.join("a.txt"), "alpha beta\ngamma delta\n").unwrap();
140        let idx = tmp.join(".sift");
141        let _ = IndexBuilder::new(&tmp).with_dir(&idx).build().unwrap();
142        let index = Index::open(&idx).unwrap();
143        let plan = index.explain("foo.*");
144        assert_eq!(plan.pattern, "foo.*");
145        assert_eq!(plan.mode, "indexed_candidates");
146    }
147
148    #[test]
149    fn explain_returns_full_scan_for_true_no_literal() {
150        let tmp =
151            std::env::temp_dir().join(format!("sift-explain-fullscan-{}", std::process::id()));
152        let _ = fs::remove_dir_all(&tmp);
153        fs::create_dir_all(&tmp).unwrap();
154        fs::write(tmp.join("a.txt"), "alpha beta\ngamma delta\n").unwrap();
155        let idx = tmp.join(".sift");
156        let _ = IndexBuilder::new(&tmp).with_dir(&idx).build().unwrap();
157        let index = Index::open(&idx).unwrap();
158        let plan = index.explain(r"\w{5}\s+\w{5}");
159        assert_eq!(plan.pattern, r"\w{5}\s+\w{5}");
160        assert_eq!(plan.mode, "full_scan");
161    }
162
163    #[test]
164    fn indexed_search_matches_naive_for_literal() {
165        let tmp = std::env::temp_dir().join(format!("sift-idx-parity-{}", std::process::id()));
166        let _ = fs::remove_dir_all(&tmp);
167        fs::create_dir_all(tmp.join("a")).unwrap();
168        fs::create_dir_all(tmp.join("b")).unwrap();
169        fs::write(tmp.join("a/x.txt"), "alpha beta\n").unwrap();
170        fs::write(tmp.join("b/y.txt"), "gamma delta\n").unwrap();
171
172        let idx = tmp.join(".sift");
173        let _ = IndexBuilder::new(&tmp).with_dir(&idx).build().unwrap();
174        let index = Index::open(&idx).unwrap();
175
176        let pat = vec!["beta".to_string()];
177        let opts = SearchOptions::default();
178        let q = CompiledSearch::new(&pat, opts).unwrap();
179        let naive = q.collect_walk_matches(&tmp).unwrap();
180        let indexed = q.collect_index_matches(&index).unwrap();
181        assert_eq!(indexed, naive);
182    }
183
184    #[test]
185    fn full_scan_parallel_candidate_path_finds_all_files() {
186        let tmp = std::env::temp_dir().join(format!("sift-parallel-fs-{}", std::process::id()));
187        let _ = fs::remove_dir_all(&tmp);
188        fs::create_dir_all(tmp.join("d")).unwrap();
189
190        let min_parallel = crate::search::parallel_candidate_min_files();
191        let n_files = if min_parallel == usize::MAX {
192            3
193        } else {
194            min_parallel.clamp(2, 64)
195        };
196        for i in 0..n_files {
197            fs::write(
198                tmp.join("d").join(format!("f{i}.txt")),
199                format!("line {i} needle\n"),
200            )
201            .unwrap();
202        }
203        let idx = tmp.join(".sift");
204        let _ = IndexBuilder::new(&tmp).with_dir(&idx).build().unwrap();
205        let index = Index::open(&idx).unwrap();
206        assert_eq!(index.file_count(), n_files);
207
208        let pat = vec!["needle".to_string()];
209        let opts = SearchOptions::default();
210        let q = CompiledSearch::new(&pat, opts).unwrap();
211        let hits = q.collect_index_matches(&index).unwrap();
212        assert_eq!(hits.len(), n_files);
213    }
214
215    #[test]
216    fn full_scan_uses_files_bin_same_hits_as_fresh_walk() {
217        let tmp = std::env::temp_dir().join(format!("sift-fullscan-parity-{}", std::process::id()));
218        let _ = fs::remove_dir_all(&tmp);
219        fs::create_dir_all(tmp.join("keep")).unwrap();
220        fs::write(tmp.join("keep/a.txt"), "one\ntwo beta\n").unwrap();
221        fs::write(tmp.join("keep/b.txt"), "three\n").unwrap();
222        fs::write(tmp.join(".ignore"), "ignored\n").unwrap();
223        fs::create_dir_all(tmp.join("ignored")).unwrap();
224        fs::write(tmp.join("ignored/hidden.txt"), "beta skip\n").unwrap();
225
226        let idx = tmp.join(".sift");
227        let _ = IndexBuilder::new(&tmp).with_dir(&idx).build().unwrap();
228        let index = Index::open(&idx).unwrap();
229
230        let pat = vec![".*".to_string()];
231        let opts = SearchOptions::default();
232        let q = CompiledSearch::new(&pat, opts).unwrap();
233        let mut from_index = q.collect_index_matches(&index).unwrap();
234        let mut from_walk = q.collect_walk_matches(&tmp).unwrap();
235        from_index.sort_by(|a, b| (&a.file, a.line, &a.text).cmp(&(&b.file, b.line, &b.text)));
236        from_walk.sort_by(|a, b| (&a.file, a.line, &a.text).cmp(&(&b.file, b.line, &b.text)));
237        assert_eq!(from_index, from_walk);
238    }
239
240    #[test]
241    fn build_open_single_file_search_finds_line() {
242        let tmp = std::env::temp_dir().join(format!("sift-single-file-{}", std::process::id()));
243        let _ = fs::remove_dir_all(&tmp);
244        fs::create_dir_all(&tmp).unwrap();
245        let file = tmp.join("one.txt");
246        fs::write(&file, "alpha\nbeta needle\n").unwrap();
247
248        let idx = tmp.join(".sift");
249        let _ = IndexBuilder::new(&file).with_dir(&idx).build().unwrap();
250        let index = Index::open(&idx).unwrap();
251
252        let expected_root = file.canonicalize().unwrap().parent().unwrap().to_path_buf();
253        assert_eq!(
254            normalized_path(&index.root),
255            normalized_path(&expected_root)
256        );
257        assert!(matches!(index.corpus_kind, index::CorpusKind::File { .. }));
258        assert_eq!(index.file_count(), 1);
259        assert_eq!(index.file_path(0).unwrap(), std::path::Path::new("one.txt"));
260
261        let pat = vec!["needle".to_string()];
262        let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
263        let hits = q.collect_index_matches(&index).unwrap();
264        assert_eq!(hits.len(), 1);
265        assert_eq!(
266            normalized_path(&hits[0].file),
267            normalized_path(&file.canonicalize().unwrap())
268        );
269        assert_eq!(hits[0].line, 2);
270    }
271
272    #[test]
273    fn single_file_meta_is_json_with_explicit_kind() {
274        let tmp =
275            std::env::temp_dir().join(format!("sift-single-file-meta-{}", std::process::id()));
276        let _ = fs::remove_dir_all(&tmp);
277        fs::create_dir_all(&tmp).unwrap();
278        let file = tmp.join("one.txt");
279        fs::write(&file, "alpha\n").unwrap();
280
281        let idx = tmp.join(".sift");
282        let _ = IndexBuilder::new(&file).with_dir(&idx).build().unwrap();
283        let meta = fs::read_to_string(idx.join(META_FILENAME)).unwrap();
284
285        assert!(
286            meta.contains("\"kind\": \"file\""),
287            "unexpected meta: {meta}"
288        );
289        assert!(meta.contains("\"entries\""), "unexpected meta: {meta}");
290        assert!(meta.contains("\"one.txt\""), "unexpected meta: {meta}");
291    }
292}