1use crate::bloom::BloomFilter;
8use crate::decompress::maybe_decompress;
9use crate::error::Result;
10use crate::format::*;
11use crate::posting::{PostingEntry, PostingList};
12use crate::string_pool::StringPool;
13use crate::trigram::{Extractor, Trigram};
14use ignore::WalkBuilder;
15use memmap2::Mmap;
16use std::collections::HashMap;
17use std::fs::{self, File};
18use std::io::{BufWriter, Seek, SeekFrom, Write};
19use std::path::{Path, PathBuf};
20use std::time::{Instant, SystemTime, UNIX_EPOCH};
21
/// Builds a trigram search index for a directory tree and serializes it to
/// a shard file under `.ix/` in the indexed root.
pub struct Builder {
    // Root directory being indexed; all discovered paths live under it.
    root: PathBuf,
    // Per-file metadata and cached trigram data, in file_id order.
    files: Vec<FileRecord>,
    // Global inverted index: trigram -> posting entries across all files.
    postings: HashMap<Trigram, Vec<PostingEntry>>,
    // Interned file paths; serialized into the shard's string-pool section.
    string_pool: StringPool,
    // Trigram extractor, reused across all scanned files.
    extractor: Extractor,
    // Counters accumulated while scanning; logged when a build finishes.
    stats: BuildStats,
    // When true, recognized compressed files are decompressed before scanning.
    decompress: bool,
}
31
/// Per-file index data: identity, change-detection metadata, and the file's
/// extracted trigrams (cached so incremental updates can reuse them without
/// re-reading unchanged files).
pub struct FileRecord {
    // Dense id assigned at build time; reassigned on incremental updates.
    pub file_id: u32,
    pub path: PathBuf,
    // Modification time in nanoseconds since the Unix epoch (0 if unknown/pre-epoch).
    pub mtime_ns: u64,
    pub size_bytes: u64,
    // xxHash64 (seed 0) of the scanned — possibly decompressed — content.
    pub content_hash: u64,
    // Number of distinct trigrams extracted from this file.
    pub trigram_count: u32,
    // Whether the file was classified as binary.
    // NOTE(review): always set to false here since binary files are skipped
    // before a record is created — confirm other constructors if any exist.
    pub is_binary: bool,
    // (trigram, byte offsets) pairs cached for incremental rebuilds.
    pub trigrams: Vec<(Trigram, Vec<u32>)>,
    // Per-file bloom filter over the file's trigrams.
    pub bloom: BloomFilter,
}
43
/// Counters accumulated while scanning files during a build.
#[derive(Default, Debug)]
pub struct BuildStats {
    // Files successfully scanned and added to the index.
    pub files_scanned: u64,
    // Files skipped because their content looked binary.
    pub files_skipped_binary: u64,
    // Files skipped for exceeding the size cap.
    pub files_skipped_size: u64,
    // Total bytes of content scanned for indexed files.
    pub bytes_scanned: u64,
    // Distinct trigram count across the whole index.
    // NOTE(review): not incremented during scanning in this file — confirm
    // it is populated before the stats are reported.
    pub unique_trigrams: u64,
}
52
53impl Builder {
54 pub fn new(root: &Path) -> Self {
55 Self {
56 root: root.to_owned(),
57 files: Vec::new(),
58 postings: HashMap::new(),
59 string_pool: StringPool::new(),
60 extractor: Extractor::new(),
61 stats: BuildStats::default(),
62 decompress: false,
63 }
64 }
65
66 pub fn set_decompress(&mut self, decompress: bool) {
67 self.decompress = decompress;
68 }
69
70 pub fn build(&mut self) -> Result<PathBuf> {
71 let start = Instant::now();
72
73 let file_paths = self.discover_files()?;
75
76 let mut next_id = 0u32;
78 for path in file_paths {
79 if self.process_file(next_id, path)? {
80 next_id += 1;
81 }
82 }
83
84 let output_path = self.serialize()?;
86
87 tracing::info!("Build completed in {:?}: {:?}", start.elapsed(), self.stats);
88
89 Ok(output_path)
90 }
91
92 pub fn update(&mut self, changed_files: &[PathBuf]) -> Result<PathBuf> {
93 let start = Instant::now();
94
95 if self.files.is_empty() {
96 return self.build();
97 }
98
99 let changed_set: std::collections::HashSet<_> = changed_files.iter().collect();
100
101 let old_files = std::mem::take(&mut self.files);
102 self.postings = HashMap::new();
103 self.string_pool = StringPool::new();
104
105 let mut next_id = 0u32;
106 for mut record in old_files {
107 if changed_set.contains(&record.path) {
108 if self.process_file(next_id, record.path.clone())? {
110 next_id += 1;
111 }
112 } else if record.path.exists() {
113 self.string_pool.add_path(&record.path);
115 for (tri, offsets) in &record.trigrams {
116 self.postings.entry(*tri).or_default().push(PostingEntry {
117 file_id: next_id,
118 offsets: offsets.clone(),
119 });
120 }
121 record.file_id = next_id;
122 self.files.push(record);
123 next_id += 1;
124 }
125 }
127
128 let mut to_add = Vec::new();
130 {
131 let existing_paths: std::collections::HashSet<_> =
132 self.files.iter().map(|f| &f.path).collect();
133 for path in changed_files {
134 if !existing_paths.contains(path) && path.exists() {
135 to_add.push(path.clone());
136 }
137 }
138 }
139
140 for path in to_add {
141 if self.process_file(next_id, path)? {
142 next_id += 1;
143 }
144 }
145
146 let output_path = self.serialize()?;
147 tracing::info!("Incremental update completed in {:?}", start.elapsed());
148 Ok(output_path)
149 }
150
151 pub fn files_len(&self) -> usize {
152 self.files.len()
153 }
154
155 pub fn trigrams_len(&self) -> usize {
156 self.postings.len()
157 }
158
159 fn discover_files(&mut self) -> Result<Vec<PathBuf>> {
160 let mut paths = Vec::new();
161
162 let walker = WalkBuilder::new(&self.root)
163 .hidden(false)
164 .git_ignore(true)
165 .require_git(false)
166 .add_custom_ignore_filename(".ixignore")
167 .filter_entry(move |entry| {
168 let path = entry.path();
169 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
170
171 if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
173 && (name == "lost+found" || name == ".git" || name == "node_modules" ||
174 name == "target" || name == "__pycache__" || name == ".tox" ||
175 name == ".venv" || name == "venv" || name == ".ix")
176 {
177 return false;
178 }
179
180 if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
182 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
183 match ext {
184 "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
186 "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
188 "zip" | "7z" | "rar" |
190 "sqlite" | "db" | "bin" => return false,
192 _ => {}
193 }
194 if name.ends_with(".tar.gz") {
196 return false;
197 }
198 }
199 true
200 })
201 .build();
202
203 for result in walker {
204 match result {
205 Ok(entry) => {
206 if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
207 paths.push(entry.path().to_owned());
208 }
209 }
210 Err(e) => {
211 eprintln!("ix: warning: skipping path: {}", e);
212 }
213 }
214 }
215 Ok(paths)
216 }
217
218 fn process_file(&mut self, file_id: u32, path: PathBuf) -> Result<bool> {
219 let metadata = fs::metadata(&path)?;
220 let size = metadata.len();
221 let mtime = metadata
222 .modified()?
223 .duration_since(UNIX_EPOCH)
224 .map(|d| d.as_nanos() as u64)
225 .unwrap_or(0);
226
227 if size > 100 * 1024 * 1024 {
228 self.stats.files_skipped_size += 1;
230 return Ok(false);
231 }
232
233 let file = File::open(&path)?;
234 let mmap = unsafe { Mmap::map(&file)? };
235
236 let raw_data = if self.decompress {
238 if let Some(mut reader) = maybe_decompress(&path, &mmap)? {
239 let mut buf = Vec::new();
240 use std::io::Read;
244 reader.read_to_end(&mut buf)?; std::borrow::Cow::Owned(buf)
249 } else {
250 std::borrow::Cow::Borrowed(&mmap[..])
251 }
252 } else {
253 std::borrow::Cow::Borrowed(&mmap[..])
254 };
255
256 let data = &raw_data[..];
257
258 if is_binary(data) {
260 self.stats.files_skipped_binary += 1;
261 return Ok(false);
262 }
263
264 let content_hash = xxhash_rust::xxh64::xxh64(data, 0);
265 let pairs = self.extractor.extract_with_offsets(data);
266
267 self.string_pool.add_path(&path);
268 let mut bloom = BloomFilter::new(256, 5);
269 let mut trigram_count = 0;
270 let mut trigrams = Vec::with_capacity(pairs.len().min(4096));
271
272 let mut i = 0;
274 while i < pairs.len() {
275 let tri = pairs[i].0;
276 let mut j = i + 1;
277 while j < pairs.len() && pairs[j].0 == tri {
278 j += 1;
279 }
280
281 let offsets: Vec<u32> = pairs[i..j].iter().map(|p| p.1).collect();
283
284 bloom.insert(tri);
285 self.postings.entry(tri).or_default().push(PostingEntry {
286 file_id,
287 offsets: offsets.clone(),
288 });
289
290 trigrams.push((tri, offsets));
291 trigram_count += 1;
292 i = j;
293 }
294
295 self.files.push(FileRecord {
296 file_id,
297 path,
298 mtime_ns: mtime,
299 size_bytes: size,
300 content_hash,
301 trigram_count,
302 is_binary: false,
303 trigrams,
304 bloom,
305 });
306
307 self.stats.files_scanned += 1;
308 self.stats.bytes_scanned += size;
309
310 Ok(true)
311 }
312
    /// Serialize the full index to `.ix/shard.ix` under the root.
    ///
    /// Layout: a fixed-size header, then 8-byte-aligned sections in order:
    /// file table, posting data, trigram table, per-file bloom filters,
    /// string pool. The shard is written to `shard.ix.tmp` and atomically
    /// renamed into place, so readers never observe a partial shard.
    fn serialize(&mut self) -> Result<PathBuf> {
        let ix_dir = self.root.join(".ix");
        fs::create_dir_all(&ix_dir)?;
        let tmp_path = ix_dir.join("shard.ix.tmp");
        let final_path = ix_dir.join("shard.ix");

        let mut f = BufWriter::new(File::create(&tmp_path)?);

        // Reserve header space now; the real header is written last, once all
        // section offsets and sizes are known.
        f.write_all(&[0u8; HEADER_SIZE])?;

        // Serialize the string pool into memory up front: its bytes land near
        // the end of the shard, but path offsets are needed for the file table.
        let mut string_pool_buf = std::io::Cursor::new(Vec::new());
        self.string_pool.serialize(&mut string_pool_buf)?;
        let string_pool_data = string_pool_buf.into_inner();

        // ---- File table ----
        let file_table_offset = self.align_to_8(&mut f)?;
        for record in &self.files {
            let (path_off, path_len) = self.string_pool.get_info(&record.path);
            f.write_all(&record.file_id.to_le_bytes())?;
            f.write_all(&path_off.to_le_bytes())?;
            f.write_all(&path_len.to_le_bytes())?;
            f.write_all(&[FileStatus::Fresh as u8])?;
            f.write_all(&[0u8])?; // one padding byte after the status byte
            f.write_all(&record.mtime_ns.to_le_bytes())?;
            f.write_all(&record.size_bytes.to_le_bytes())?;
            f.write_all(&record.content_hash.to_le_bytes())?;
            f.write_all(&record.trigram_count.to_le_bytes())?;
            // Placeholder for the bloom-filter relative offset; patched in
            // place later, after the bloom section has been written.
            f.write_all(&0u32.to_le_bytes())?;
            f.write_all(&[0u8; 4])?; // trailing padding
        }
        let file_table_size = f.stream_position()? - file_table_offset;

        // ---- Posting lists, in ascending trigram order ----
        self.align_to_8(&mut f)?;
        let posting_data_offset = f.stream_position()?;
        let mut posting_infos = HashMap::new();

        let mut sorted_trigrams: Vec<Trigram> = self.postings.keys().cloned().collect();
        sorted_trigrams.sort_unstable();

        for &tri in &sorted_trigrams {
            let entries = &self.postings[&tri];
            let list = PostingList {
                entries: entries.clone(),
            };
            let encoded = list.encode();
            // Offset is recorded relative to the start of the posting section.
            let offset = f.stream_position()? - posting_data_offset;
            f.write_all(&encoded)?;
            posting_infos.insert(tri, (offset, encoded.len() as u32, entries.len() as u32));
        }
        let posting_data_size = f.stream_position()? - posting_data_offset;

        // ---- Trigram table: one fixed-size entry per distinct trigram ----
        self.align_to_8(&mut f)?;
        let trigram_table_offset = f.stream_position()?;
        for &tri in &sorted_trigrams {
            if let Some(&(off, len, freq)) = posting_infos.get(&tri) {
                let abs_off = posting_data_offset + off;
                f.write_all(&tri.to_le_bytes())?;
                // Absolute posting offset stored as the low 6 bytes of the
                // little-endian u64 (48-bit offset).
                f.write_all(&abs_off.to_le_bytes()[..6])?;
                f.write_all(&len.to_le_bytes())?;
                f.write_all(&freq.to_le_bytes())?;
                f.write_all(&[0u8; 2])?; // padding
            }
        }
        let trigram_table_size = f.stream_position()? - trigram_table_offset;

        // ---- Per-file bloom filters ----
        self.align_to_8(&mut f)?;
        let bloom_offset = f.stream_position()?;
        let mut bloom_relative_offsets = Vec::new();
        for record in &self.files {
            // Remember each filter's offset relative to the bloom section so
            // the file-table placeholders can be patched below.
            let rel_off = (f.stream_position()? - bloom_offset) as u32;
            bloom_relative_offsets.push(rel_off);
            record.bloom.serialize(&mut f)?;
        }
        let bloom_size = f.stream_position()? - bloom_offset;

        // ---- String pool ----
        self.align_to_8(&mut f)?;
        let string_pool_offset = f.stream_position()?;
        f.write_all(&string_pool_data)?;
        let string_pool_size = string_pool_data.len() as u64;

        // Name-index section is reserved in the header but not yet emitted.
        let name_index_offset = f.stream_position()?;
        let name_index_size = 0u64;

        // Patch each file-table entry's bloom-offset placeholder in place.
        // NOTE(review): the 40-byte skip assumes the fields before the
        // placeholder total 40 bytes (consistent with path_off being 4 bytes
        // and path_len 2: 4+4+2+1+1+8+8+8+4) — confirm against the format
        // definitions if those field types ever change.
        f.seek(SeekFrom::Start(file_table_offset))?;
        for (i, _record) in self.files.iter().enumerate() {
            f.seek(SeekFrom::Current(40))?; // skip to the placeholder field
            f.write_all(&bloom_relative_offsets[i].to_le_bytes())?;
            f.seek(SeekFrom::Current(4))?; // skip the trailing padding
        }

        // ---- Header (written last; carries a CRC of its own bytes) ----
        let created_at = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_micros() as u64;
        let mut header_bytes = [0u8; HEADER_SIZE];
        header_bytes[0..4].copy_from_slice(&MAGIC);
        header_bytes[0x04..0x06].copy_from_slice(&VERSION_MAJOR.to_le_bytes());
        header_bytes[0x06..0x08].copy_from_slice(&VERSION_MINOR.to_le_bytes());
        header_bytes[0x08..0x10].copy_from_slice(
            &(flags::HAS_BLOOM_FILTERS
                | flags::HAS_CONTENT_HASHES
                | flags::POSTING_LISTS_CHECKSUMMED)
                .to_le_bytes(),
        );
        header_bytes[0x10..0x18].copy_from_slice(&created_at.to_le_bytes());
        header_bytes[0x18..0x20].copy_from_slice(&self.stats.bytes_scanned.to_le_bytes());
        header_bytes[0x20..0x24].copy_from_slice(&(self.files.len() as u32).to_le_bytes());
        header_bytes[0x24..0x28].copy_from_slice(&(sorted_trigrams.len() as u32).to_le_bytes());
        header_bytes[0x28..0x30].copy_from_slice(&file_table_offset.to_le_bytes());
        header_bytes[0x30..0x38].copy_from_slice(&file_table_size.to_le_bytes());
        header_bytes[0x38..0x40].copy_from_slice(&trigram_table_offset.to_le_bytes());
        header_bytes[0x40..0x48].copy_from_slice(&trigram_table_size.to_le_bytes());
        header_bytes[0x48..0x50].copy_from_slice(&posting_data_offset.to_le_bytes());
        header_bytes[0x50..0x58].copy_from_slice(&posting_data_size.to_le_bytes());
        header_bytes[0x58..0x60].copy_from_slice(&bloom_offset.to_le_bytes());
        header_bytes[0x60..0x68].copy_from_slice(&bloom_size.to_le_bytes());
        header_bytes[0x68..0x70].copy_from_slice(&string_pool_offset.to_le_bytes());
        header_bytes[0x70..0x78].copy_from_slice(&string_pool_size.to_le_bytes());
        header_bytes[0x78..0x80].copy_from_slice(&name_index_offset.to_le_bytes());
        header_bytes[0x80..0x88].copy_from_slice(&name_index_size.to_le_bytes());

        // CRC32C over every header byte before the checksum field itself.
        let crc = crc32c::crc32c(&header_bytes[0..0xF8]);
        header_bytes[0xF8..0xFC].copy_from_slice(&crc.to_le_bytes());

        f.seek(SeekFrom::Start(0))?;
        f.write_all(&header_bytes)?;
        f.flush()?;
        drop(f);

        // Atomic publish: readers see either the old shard or the new one.
        fs::rename(&tmp_path, &final_path)?;
        Ok(final_path)
    }
454
455 fn align_to_8<W: Write + Seek>(&self, mut w: W) -> std::io::Result<u64> {
456 let pos = w.stream_position()?;
457 let padding = (8 - (pos % 8)) % 8;
458 if padding > 0 {
459 w.write_all(&vec![0u8; padding as usize])?;
460 }
461 w.stream_position()
462 }
463}