// sift_core/lib.rs

//! Fast indexed regex search over codebases — core engine.
//!
//! **Walking:** [`WalkBuilder`] from the [`ignore`] crate (ripgrep-class ignore rules).

5mod index;
6mod planner;
7mod search;
8pub mod storage;
9mod verify;
10
11pub use index::{CorpusKind, Index, IndexBuilder, QueryPlan};
12pub use storage::{lexicon, postings};
13pub use verify::{compile_pattern, compile_search_pattern};
14
15pub use planner::TrigramPlan;
16pub use search::{
17    walk_file_paths, CandidateInfo, CaseMode, CompiledSearch, FilenameMode, GlobConfig, HiddenMode,
18    IgnoreConfig, IgnoreSources, Match, OutputEmission, SearchFilter, SearchFilterConfig,
19    SearchMatchFlags, SearchMode, SearchOptions, SearchOutput, VisibilityConfig,
20};
21
22pub use ignore::{Walk, WalkBuilder};
23
24pub use index::trigram::extract_trigrams;
25
26use std::path::PathBuf;
27
28use thiserror::Error;
29
/// Directory name `.sift`; the tests in this file place the index directory at `<root>/.sift`.
pub const SIFT_DIR: &str = ".sift";
/// Sub-directory name `.index`.
// NOTE(review): presumably nested under `SIFT_DIR` — confirm against the index builder.
pub const INDEX_SUBDIR: &str = ".index";
/// File name of the index metadata; the tests in this file read and write it directly inside
/// the index directory.
pub const META_FILENAME: &str = "sift.meta";
/// File name `files.bin`.
// NOTE(review): presumably the indexed-file table — confirm in the index/storage modules.
pub const FILES_BIN: &str = "files.bin";
/// File name `lexicon.bin`.
// NOTE(review): presumably written by `storage::lexicon` — confirm.
pub const LEXICON_BIN: &str = "lexicon.bin";
/// File name `postings.bin`.
// NOTE(review): presumably written by `storage::postings` — confirm.
pub const POSTINGS_BIN: &str = "postings.bin";

37#[derive(Debug, Error)]
38pub enum Error {
39    #[error("IO error: {0}")]
40    Io(#[from] std::io::Error),
41
42    #[error("ignore walk error: {0}")]
43    Ignore(#[from] ignore::Error),
44
45    #[error("regex error: {0}")]
46    Regex(#[from] Box<regex_automata::meta::BuildError>),
47
48    #[error("regex build error: {0}")]
49    RegexBuild(String),
50
51    #[error("search patterns must not be empty")]
52    EmptyPatterns,
53
54    #[error("invalid max-count: 0 matches requested")]
55    InvalidMaxCount,
56
57    #[error("invalid index metadata: {0}")]
58    InvalidMeta(PathBuf),
59
60    #[error("index not initialized (missing {0})")]
61    MissingMeta(PathBuf),
62
63    #[error("index component missing: {0}")]
64    MissingComponent(PathBuf),
65}
66
67pub type Result<T> = std::result::Result<T, Error>;
68
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::{Path, PathBuf};

    /// Per-test scratch directory under the system temp dir.
    ///
    /// The directory is recreated empty by [`ScratchDir::new`] and removed again when the guard
    /// is dropped, so tests no longer leave fixtures behind — including when an assertion fails
    /// and the test unwinds.
    struct ScratchDir {
        path: PathBuf,
    }

    impl ScratchDir {
        /// Creates `<temp>/<label>-<pid>` from scratch, discarding leftovers of an earlier run.
        fn new(label: &str) -> Self {
            let path = std::env::temp_dir().join(format!("{label}-{}", std::process::id()));
            let _ = fs::remove_dir_all(&path);
            fs::create_dir_all(&path).unwrap();
            Self { path }
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best effort only: panicking here during an unwind would abort the test binary.
            let _ = fs::remove_dir_all(&self.path);
        }
    }

    /// Normalizes platform-specific path spellings so that two paths to the same file compare
    /// equal: the `\\?\` verbatim prefix on Windows and the leading `/private` on macOS.
    fn normalized_path(p: &Path) -> PathBuf {
        let s = p.display().to_string();
        #[cfg(windows)]
        let s = s.strip_prefix("\\\\?\\").unwrap_or(&s).to_string();
        // Only a leading `/private/` component is dropped; a directory that merely happens to
        // be called `private` deeper in the path must stay intact.
        #[cfg(target_os = "macos")]
        let s = s
            .strip_prefix("/private")
            .filter(|rest| rest.starts_with('/'))
            .unwrap_or(&s)
            .to_string();
        PathBuf::from(s)
    }

    /// Build an index over a small tree, reopen it, and find a regex match on the right line.
    #[test]
    fn build_open_search_finds_line() {
        let scratch = ScratchDir::new("sift-core-test");
        let tmp = &scratch.path;
        fs::create_dir_all(tmp.join("src")).unwrap();
        fs::write(tmp.join("src/lib.rs"), "fn hello() {\n  let x = 1;\n}\n").unwrap();

        let idx = tmp.join(".sift");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();

        let index = Index::open(&idx).unwrap();
        assert!(index.file_count() > 0);
        let pat = vec![r"let\s+x".to_string()];
        let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
        let hits = q.collect_index_matches(&index).unwrap();
        assert_eq!(hits.len(), 1);
        assert!(hits[0].file.ends_with("src/lib.rs"));
        assert_eq!(hits[0].line, 2);
    }

    /// Opening a directory without a metadata file reports `MissingMeta`.
    #[test]
    fn open_missing_meta_errors() {
        let scratch = ScratchDir::new("sift-missing-meta");
        let tmp = &scratch.path;
        assert!(matches!(Index::open(tmp), Err(Error::MissingMeta(_))));
    }

    /// Valid metadata without the remaining index files reports `MissingComponent`.
    #[test]
    fn open_missing_table_errors() {
        let scratch = ScratchDir::new("sift-missing-table");
        let tmp = &scratch.path;
        let root_path = std::env::temp_dir().join("sift-test-root");
        let meta = crate::index::IndexMeta {
            root: root_path,
            kind: crate::index::CorpusKind::Directory,
        };
        fs::write(
            tmp.join(META_FILENAME),
            serde_json::to_string_pretty(&meta).unwrap(),
        )
        .unwrap();
        assert!(matches!(Index::open(tmp), Err(Error::MissingComponent(_))));
    }

    /// An empty metadata file reports `InvalidMeta`.
    #[test]
    fn open_empty_meta_errors() {
        let scratch = ScratchDir::new("sift-empty-meta");
        let tmp = &scratch.path;
        fs::write(tmp.join(META_FILENAME), "").unwrap();
        assert!(matches!(Index::open(tmp), Err(Error::InvalidMeta(_))));
    }

    /// A pattern with a literal prefix is planned as an indexed-candidates query.
    #[test]
    fn explain_returns_indexed_plan_for_literal_prefix() {
        let scratch = ScratchDir::new("sift-explain-indexed");
        let tmp = &scratch.path;
        fs::write(tmp.join("a.txt"), "alpha beta\ngamma delta\n").unwrap();
        let idx = tmp.join(".sift");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();
        let plan = index.explain("foo.*");
        assert_eq!(plan.pattern, "foo.*");
        assert_eq!(plan.mode, "indexed_candidates");
    }

    /// A pattern without any literal is planned as a full scan.
    #[test]
    fn explain_returns_full_scan_for_true_no_literal() {
        let scratch = ScratchDir::new("sift-explain-fullscan");
        let tmp = &scratch.path;
        fs::write(tmp.join("a.txt"), "alpha beta\ngamma delta\n").unwrap();
        let idx = tmp.join(".sift");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();
        let plan = index.explain(r"\w{5}\s+\w{5}");
        assert_eq!(plan.pattern, r"\w{5}\s+\w{5}");
        assert_eq!(plan.mode, "full_scan");
    }

    /// Index-backed search and a plain directory walk return the same matches for a literal.
    #[test]
    fn indexed_search_matches_naive_for_literal() {
        let scratch = ScratchDir::new("sift-idx-parity");
        let tmp = &scratch.path;
        fs::create_dir_all(tmp.join("a")).unwrap();
        fs::create_dir_all(tmp.join("b")).unwrap();
        fs::write(tmp.join("a/x.txt"), "alpha beta\n").unwrap();
        fs::write(tmp.join("b/y.txt"), "gamma delta\n").unwrap();

        let idx = tmp.join(".sift");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();

        let pat = vec!["beta".to_string()];
        let opts = SearchOptions::default();
        let q = CompiledSearch::new(&pat, opts).unwrap();
        let naive = q.collect_walk_matches(tmp).unwrap();
        let indexed = q.collect_index_matches(&index).unwrap();
        assert_eq!(indexed, naive);
    }

    /// With a file count derived from the parallel-candidate threshold, every file is matched.
    #[test]
    fn full_scan_parallel_candidate_path_finds_all_files() {
        let scratch = ScratchDir::new("sift-parallel-fs");
        let tmp = &scratch.path;
        fs::create_dir_all(tmp.join("d")).unwrap();

        // Size the corpus from the threshold, but keep it small (2..=64 files); fall back to
        // 3 files when the threshold is `usize::MAX`.
        let min_parallel = crate::search::parallel_candidate_min_files();
        let n_files = if min_parallel == usize::MAX {
            3
        } else {
            min_parallel.clamp(2, 64)
        };
        for i in 0..n_files {
            fs::write(
                tmp.join("d").join(format!("f{i}.txt")),
                format!("line {i} needle\n"),
            )
            .unwrap();
        }
        let idx = tmp.join(".sift");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();
        assert_eq!(index.file_count(), n_files);

        let pat = vec!["needle".to_string()];
        let opts = SearchOptions::default();
        let q = CompiledSearch::new(&pat, opts).unwrap();
        let hits = q.collect_index_matches(&index).unwrap();
        assert_eq!(hits.len(), n_files);
    }

    /// A match-everything pattern yields the same hits from the index as from a fresh walk,
    /// including the effect of the `.ignore` file.
    #[test]
    fn full_scan_uses_files_bin_same_hits_as_fresh_walk() {
        let scratch = ScratchDir::new("sift-fullscan-parity");
        let tmp = &scratch.path;
        fs::create_dir_all(tmp.join("keep")).unwrap();
        fs::write(tmp.join("keep/a.txt"), "one\ntwo beta\n").unwrap();
        fs::write(tmp.join("keep/b.txt"), "three\n").unwrap();
        fs::write(tmp.join(".ignore"), "ignored\n").unwrap();
        fs::create_dir_all(tmp.join("ignored")).unwrap();
        fs::write(tmp.join("ignored/hidden.txt"), "beta skip\n").unwrap();

        let idx = tmp.join(".sift");
        let _ = IndexBuilder::new(tmp).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();

        let pat = vec![".*".to_string()];
        let opts = SearchOptions::default();
        let q = CompiledSearch::new(&pat, opts).unwrap();
        let mut from_index = q.collect_index_matches(&index).unwrap();
        let mut from_walk = q.collect_walk_matches(tmp).unwrap();
        // Sort both sides the same way so the comparison does not depend on result order.
        from_index.sort_by(|a, b| (&a.file, a.line, &a.text).cmp(&(&b.file, b.line, &b.text)));
        from_walk.sort_by(|a, b| (&a.file, a.line, &a.text).cmp(&(&b.file, b.line, &b.text)));
        assert_eq!(from_index, from_walk);
    }

    /// Indexing a single file roots the index at the file's parent directory and finds a match.
    #[test]
    fn build_open_single_file_search_finds_line() {
        let scratch = ScratchDir::new("sift-single-file");
        let tmp = &scratch.path;
        let file = tmp.join("one.txt");
        fs::write(&file, "alpha\nbeta needle\n").unwrap();

        let idx = tmp.join(".sift");
        let _ = IndexBuilder::new(&file).with_dir(&idx).build().unwrap();
        let index = Index::open(&idx).unwrap();

        let expected_root = file.canonicalize().unwrap().parent().unwrap().to_path_buf();
        assert_eq!(
            normalized_path(&index.root),
            normalized_path(&expected_root)
        );
        assert!(matches!(index.corpus_kind, index::CorpusKind::File { .. }));
        assert_eq!(index.file_count(), 1);
        assert_eq!(index.file_path(0).unwrap(), Path::new("one.txt"));

        let pat = vec!["needle".to_string()];
        let q = CompiledSearch::new(&pat, SearchOptions::default()).unwrap();
        let hits = q.collect_index_matches(&index).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(
            normalized_path(&hits[0].file),
            normalized_path(&file.canonicalize().unwrap())
        );
        assert_eq!(hits[0].line, 2);
    }

    /// The metadata written for a single-file corpus is JSON naming the kind and the file.
    #[test]
    fn single_file_meta_is_json_with_explicit_kind() {
        let scratch = ScratchDir::new("sift-single-file-meta");
        let tmp = &scratch.path;
        let file = tmp.join("one.txt");
        fs::write(&file, "alpha\n").unwrap();

        let idx = tmp.join(".sift");
        let _ = IndexBuilder::new(&file).with_dir(&idx).build().unwrap();
        let meta = fs::read_to_string(idx.join(META_FILENAME)).unwrap();

        assert!(
            meta.contains("\"kind\": \"file\""),
            "unexpected meta: {meta}"
        );
        assert!(meta.contains("\"entries\""), "unexpected meta: {meta}");
        assert!(meta.contains("\"one.txt\""), "unexpected meta: {meta}");
    }
}