//! Index builder — the complete pipeline from files to .ix shard.
//!
//! Phase 1: Discovery (walk directory, respect .gitignore)
//! Phase 2: Scan (mmap, check binary, extract trigrams, bloom filter)
//! Phase 3: Serialize (write sections, compute CRCs, atomic rename)
use crate::bloom::BloomFilter;
use crate::decompress::maybe_decompress;
use crate::error::Result;
use crate::format::*;
use crate::posting::{PostingEntry, PostingList};
use crate::string_pool::StringPool;
use crate::trigram::{Extractor, Trigram};
use ignore::WalkBuilder;
use memmap2::Mmap;
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{BufWriter, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::time::{Instant, SystemTime, UNIX_EPOCH};
21
/// Accumulates all in-memory index state across the three build phases and
/// owns serialization of the final `.ix` shard.
pub struct Builder {
    /// Root directory being indexed; the shard is written to `<root>/.ix/`.
    root: PathBuf,
    /// One record per file that was actually indexed (not skipped).
    files: Vec<FileRecord>,
    /// Inverted index: trigram -> posting entries (file id + byte offsets).
    postings: HashMap<Trigram, Vec<PostingEntry>>,
    /// Interned file paths; serialized as the shard's string-pool section.
    string_pool: StringPool,
    /// Trigram extractor, reused across all scanned files.
    extractor: Extractor,
    /// Counters reported after `build`/`update` complete.
    stats: BuildStats,
    /// When true, compressed files are decompressed in memory before scanning.
    decompress: bool,
}
31
/// Per-file index state, kept in memory so incremental updates can reuse it
/// for unchanged files without re-reading them from disk.
pub struct FileRecord {
    /// Dense id assigned at scan time; re-assigned on every `update`.
    pub file_id: u32,
    pub path: PathBuf,
    /// Modification time in nanoseconds since the Unix epoch (0 if pre-epoch).
    pub mtime_ns: u64,
    /// On-disk size in bytes (compressed size for decompressed files).
    pub size_bytes: u64,
    /// xxHash64 of the scanned (possibly decompressed) content.
    pub content_hash: u64,
    /// Number of distinct trigrams found in this file.
    pub trigram_count: u32,
    /// Always false for indexed records — binary files are skipped entirely.
    pub is_binary: bool,
    /// Cached (trigram, byte offsets) pairs, reused by incremental updates.
    pub trigrams: Vec<(Trigram, Vec<u32>)>,
    /// Per-file bloom filter over the file's trigrams.
    pub bloom: BloomFilter,
}
43
/// Counters accumulated during a build and logged when it finishes.
#[derive(Default, Debug)]
pub struct BuildStats {
    /// Files successfully scanned and indexed.
    pub files_scanned: u64,
    /// Files rejected by the binary-content check.
    pub files_skipped_binary: u64,
    /// Files rejected for exceeding the size limit.
    pub files_skipped_size: u64,
    /// Total on-disk bytes of all indexed files.
    pub bytes_scanned: u64,
    /// Number of distinct trigrams across the whole index.
    pub unique_trigrams: u64,
}
52
53impl Builder {
54    pub fn new(root: &Path) -> Self {
55        Self {
56            root: root.to_owned(),
57            files: Vec::new(),
58            postings: HashMap::new(),
59            string_pool: StringPool::new(),
60            extractor: Extractor::new(),
61            stats: BuildStats::default(),
62            decompress: false,
63        }
64    }
65
    /// Enable or disable transparent in-memory decompression of compressed
    /// files during scanning (off by default).
    pub fn set_decompress(&mut self, decompress: bool) {
        self.decompress = decompress;
    }
69
70    pub fn build(&mut self) -> Result<PathBuf> {
71        let start = Instant::now();
72
73        // 1. Discovery
74        let file_paths = self.discover_files()?;
75
76        // 2. Scan & Extract
77        let mut next_id = 0u32;
78        for path in file_paths {
79            if self.process_file(next_id, path)? {
80                next_id += 1;
81            }
82        }
83
84        // 3. Serialize
85        let output_path = self.serialize()?;
86
87        tracing::info!("Build completed in {:?}: {:?}", start.elapsed(), self.stats);
88
89        Ok(output_path)
90    }
91
92    pub fn update(&mut self, changed_files: &[PathBuf]) -> Result<PathBuf> {
93        let start = Instant::now();
94
95        if self.files.is_empty() {
96            return self.build();
97        }
98
99        let changed_set: std::collections::HashSet<_> = changed_files.iter().collect();
100
101        let old_files = std::mem::take(&mut self.files);
102        self.postings = HashMap::new();
103        self.string_pool = StringPool::new();
104
105        let mut next_id = 0u32;
106        for mut record in old_files {
107            if changed_set.contains(&record.path) {
108                // Modified
109                if self.process_file(next_id, record.path.clone())? {
110                    next_id += 1;
111                }
112            } else if record.path.exists() {
113                // Unchanged - reuse trigrams and bloom
114                self.string_pool.add_path(&record.path);
115                for (tri, offsets) in &record.trigrams {
116                    self.postings.entry(*tri).or_default().push(PostingEntry {
117                        file_id: next_id,
118                        offsets: offsets.clone(),
119                    });
120                }
121                record.file_id = next_id;
122                self.files.push(record);
123                next_id += 1;
124            }
125            // Deleted files are simply not added back to self.files
126        }
127
128        // Handle brand new files
129        let mut to_add = Vec::new();
130        {
131            let existing_paths: std::collections::HashSet<_> =
132                self.files.iter().map(|f| &f.path).collect();
133            for path in changed_files {
134                if !existing_paths.contains(path) && path.exists() {
135                    to_add.push(path.clone());
136                }
137            }
138        }
139
140        for path in to_add {
141            if self.process_file(next_id, path)? {
142                next_id += 1;
143            }
144        }
145
146        let output_path = self.serialize()?;
147        tracing::info!("Incremental update completed in {:?}", start.elapsed());
148        Ok(output_path)
149    }
150
    /// Number of files currently indexed.
    pub fn files_len(&self) -> usize {
        self.files.len()
    }

    /// Number of distinct trigrams across all indexed files.
    pub fn trigrams_len(&self) -> usize {
        self.postings.len()
    }
158
159    fn discover_files(&mut self) -> Result<Vec<PathBuf>> {
160        let mut paths = Vec::new();
161
162        let walker = WalkBuilder::new(&self.root)
163            .hidden(false)
164            .git_ignore(true)
165            .require_git(false)
166            .add_custom_ignore_filename(".ixignore")
167            .filter_entry(move |entry| {
168                let path = entry.path();
169                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
170                
171                // Built-in directory defaults
172                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
173                    && (name == "lost+found" || name == ".git" || name == "node_modules" || 
174                       name == "target" || name == "__pycache__" || name == ".tox" || 
175                       name == ".venv" || name == "venv" || name == ".ix") 
176                {
177                    return false;
178                }
179
180                // Built-in file extension defaults
181                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
182                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
183                    match ext {
184                        // Binary extensions
185                        "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
186                        // Media
187                        "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
188                        // Archives
189                        "zip" | "7z" | "rar" |
190                        // Data
191                        "sqlite" | "db" | "bin" => return false,
192                        _ => {}
193                    }
194                    // Handle .tar.gz specifically
195                    if name.ends_with(".tar.gz") {
196                        return false;
197                    }
198                }
199                true
200            })
201            .build();
202
203        for result in walker {
204            match result {
205                Ok(entry) => {
206                    if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
207                        paths.push(entry.path().to_owned());
208                    }
209                }
210                Err(e) => {
211                    eprintln!("ix: warning: skipping path: {}", e);
212                }
213            }
214        }
215        Ok(paths)
216    }
217
218    fn process_file(&mut self, file_id: u32, path: PathBuf) -> Result<bool> {
219        let metadata = fs::metadata(&path)?;
220        let size = metadata.len();
221        let mtime = metadata
222            .modified()?
223            .duration_since(UNIX_EPOCH)
224            .map(|d| d.as_nanos() as u64)
225            .unwrap_or(0);
226
227        if size > 100 * 1024 * 1024 {
228            // 100MB limit
229            self.stats.files_skipped_size += 1;
230            return Ok(false);
231        }
232
233        let file = File::open(&path)?;
234        let mmap = unsafe { Mmap::map(&file)? };
235
236        // Handle decompression
237        let raw_data = if self.decompress {
238            if let Some(mut reader) = maybe_decompress(&path, &mmap)? {
239                let mut buf = Vec::new();
240                // We still need a cap during indexing because we extract all trigrams
241                // and compute content hash, which requires full data in memory.
242                // 10MB is a safe default for 'code' files.
243                use std::io::Read;
244                reader.read_to_end(&mut buf)?; // Remove take() if we want 'no cap' but indexing MUST buffer.
245                // Wait, if indexing doesn't cap, we could OOM during index.
246                // I'll keep a large but reasonable cap for indexing.
247                // Actually, let's just read it all for now as requested 'no capping'.
248                std::borrow::Cow::Owned(buf)
249            } else {
250                std::borrow::Cow::Borrowed(&mmap[..])
251            }
252        } else {
253            std::borrow::Cow::Borrowed(&mmap[..])
254        };
255
256        let data = &raw_data[..];
257
258        // Binary check
259        if is_binary(data) {
260            self.stats.files_skipped_binary += 1;
261            return Ok(false);
262        }
263
264        let content_hash = xxhash_rust::xxh64::xxh64(data, 0);
265        let pairs = self.extractor.extract_with_offsets(data);
266        
267        self.string_pool.add_path(&path);
268        let mut bloom = BloomFilter::new(256, 5);
269        let mut trigram_count = 0;
270        let mut trigrams = Vec::with_capacity(pairs.len().min(4096));
271
272        // Group by trigram from the sorted slice
273        let mut i = 0;
274        while i < pairs.len() {
275            let tri = pairs[i].0;
276            let mut j = i + 1;
277            while j < pairs.len() && pairs[j].0 == tri {
278                j += 1;
279            }
280            
281            // Now [i..j] contains all offsets for 'tri'
282            let offsets: Vec<u32> = pairs[i..j].iter().map(|p| p.1).collect();
283            
284            bloom.insert(tri);
285            self.postings.entry(tri).or_default().push(PostingEntry {
286                file_id,
287                offsets: offsets.clone(),
288            });
289            
290            trigrams.push((tri, offsets));
291            trigram_count += 1;
292            i = j;
293        }
294
295        self.files.push(FileRecord {
296            file_id,
297            path,
298            mtime_ns: mtime,
299            size_bytes: size,
300            content_hash,
301            trigram_count,
302            is_binary: false,
303            trigrams, 
304            bloom,
305        });
306
307        self.stats.files_scanned += 1;
308        self.stats.bytes_scanned += size;
309
310        Ok(true)
311    }
312
    /// Phase 3: serialize all accumulated state into `.ix/shard.ix`.
    ///
    /// Writes to `shard.ix.tmp` first and renames into place at the end, so a
    /// crash mid-write never leaves a corrupt `shard.ix`. Section layout in
    /// file order: header placeholder, file table, posting data, trigram
    /// table, bloom filters, string pool; real offsets/sizes and a CRC32C are
    /// back-patched into the header last.
    fn serialize(&mut self) -> Result<PathBuf> {
        let ix_dir = self.root.join(".ix");
        fs::create_dir_all(&ix_dir)?;
        let tmp_path = ix_dir.join("shard.ix.tmp");
        let final_path = ix_dir.join("shard.ix");

        let mut f = BufWriter::new(File::create(&tmp_path)?);

        // Placeholder for header; the real contents are seek-written at the end.
        f.write_all(&[0u8; HEADER_SIZE])?;

        // Pre-serialize string pool to get offsets — get_info() below needs
        // the pool finalized before the file table is written.
        let mut string_pool_buf = std::io::Cursor::new(Vec::new());
        self.string_pool.serialize(&mut string_pool_buf)?;
        let string_pool_data = string_pool_buf.into_inner();

        // 1. File Table — fixed-size records. NOTE(review): the fix-up loop
        // further down assumes each record is exactly 48 bytes with
        // bloom_offset at record-relative byte 40, which constrains the
        // widths of path_off/path_len written here; confirm against format.rs.
        let file_table_offset = self.align_to_8(&mut f)?;
        for record in &self.files {
            let (path_off, path_len) = self.string_pool.get_info(&record.path);
            f.write_all(&record.file_id.to_le_bytes())?;
            f.write_all(&path_off.to_le_bytes())?;
            f.write_all(&path_len.to_le_bytes())?;
            f.write_all(&[FileStatus::Fresh as u8])?;
            f.write_all(&[0u8])?; // flags
            f.write_all(&record.mtime_ns.to_le_bytes())?;
            f.write_all(&record.size_bytes.to_le_bytes())?;
            f.write_all(&record.content_hash.to_le_bytes())?;
            f.write_all(&record.trigram_count.to_le_bytes())?;
            f.write_all(&0u32.to_le_bytes())?; // bloom_offset (filled later)
            f.write_all(&[0u8; 4])?; // padding
        }
        let file_table_size = f.stream_position()? - file_table_offset;

        // 2. Posting Data — one encoded PostingList per trigram, written in
        // sorted trigram order (presumably so readers can binary-search the
        // trigram table below).
        self.align_to_8(&mut f)?;
        let posting_data_offset = f.stream_position()?;
        let mut posting_infos = HashMap::new();

        let mut sorted_trigrams: Vec<Trigram> = self.postings.keys().cloned().collect();
        sorted_trigrams.sort_unstable();

        for &tri in &sorted_trigrams {
            let entries = &self.postings[&tri];
            let list = PostingList {
                entries: entries.clone(),
            };
            let encoded = list.encode();
            let offset = f.stream_position()? - posting_data_offset;
            f.write_all(&encoded)?;
            posting_infos.insert(tri, (offset, encoded.len() as u32, entries.len() as u32));
        }
        let posting_data_size = f.stream_position()? - posting_data_offset;

        // 3. Trigram Table — fixed 20-byte entries: 4-byte key + 6-byte
        // offset + 4-byte len + 4-byte freq + 2 bytes padding.
        self.align_to_8(&mut f)?;
        let trigram_table_offset = f.stream_position()?;
        for &tri in &sorted_trigrams {
            if let Some(&(off, len, freq)) = posting_infos.get(&tri) {
                let abs_off = posting_data_offset + off;
                f.write_all(&tri.to_le_bytes())?; // 4-byte key
                // Truncates to 48 bits — assumes the shard stays under 2^48
                // bytes, which the 100MB-per-file cap makes plausible.
                f.write_all(&abs_off.to_le_bytes()[..6])?; // u48
                f.write_all(&len.to_le_bytes())?; // u32
                f.write_all(&freq.to_le_bytes())?; // u32
                f.write_all(&[0u8; 2])?; // padding to 20 bytes total
            }
        }
        let trigram_table_size = f.stream_position()? - trigram_table_offset;

        // 4. Bloom Filters — per file, in file-table order; the relative
        // offsets collected here are patched back into the file table below.
        self.align_to_8(&mut f)?;
        let bloom_offset = f.stream_position()?;
        let mut bloom_relative_offsets = Vec::new();
        for record in &self.files {
            let rel_off = (f.stream_position()? - bloom_offset) as u32;
            bloom_relative_offsets.push(rel_off);
            record.bloom.serialize(&mut f)?;
        }
        let bloom_size = f.stream_position()? - bloom_offset;

        // 5. String Pool
        self.align_to_8(&mut f)?;
        let string_pool_offset = f.stream_position()?;
        f.write_all(&string_pool_data)?;
        let string_pool_size = string_pool_data.len() as u64;

        // 6. Name Index (TODO, optional for now)
        let name_index_offset = f.stream_position()?;
        let name_index_size = 0u64;

        // Update File Table with bloom offsets. Seeking a BufWriter flushes
        // its buffer first, so these in-place writes are safe.
        f.seek(SeekFrom::Start(file_table_offset))?;
        for (i, _record) in self.files.iter().enumerate() {
            f.seek(SeekFrom::Current(40))?; // skip to bloom_offset
            f.write_all(&bloom_relative_offsets[i].to_le_bytes())?;
            f.seek(SeekFrom::Current(4))?; // skip padding
        }

        // Finalize Header — fixed field offsets per the on-disk format;
        // CRC32C over the first 0xF8 bytes is stored at 0xF8.
        let created_at = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_micros() as u64;
        let mut header_bytes = [0u8; HEADER_SIZE];
        header_bytes[0..4].copy_from_slice(&MAGIC);
        header_bytes[0x04..0x06].copy_from_slice(&VERSION_MAJOR.to_le_bytes());
        header_bytes[0x06..0x08].copy_from_slice(&VERSION_MINOR.to_le_bytes());
        header_bytes[0x08..0x10].copy_from_slice(
            &(flags::HAS_BLOOM_FILTERS
                | flags::HAS_CONTENT_HASHES
                | flags::POSTING_LISTS_CHECKSUMMED)
                .to_le_bytes(),
        );
        header_bytes[0x10..0x18].copy_from_slice(&created_at.to_le_bytes());
        header_bytes[0x18..0x20].copy_from_slice(&self.stats.bytes_scanned.to_le_bytes());
        header_bytes[0x20..0x24].copy_from_slice(&(self.files.len() as u32).to_le_bytes());
        header_bytes[0x24..0x28].copy_from_slice(&(sorted_trigrams.len() as u32).to_le_bytes());
        header_bytes[0x28..0x30].copy_from_slice(&file_table_offset.to_le_bytes());
        header_bytes[0x30..0x38].copy_from_slice(&file_table_size.to_le_bytes());
        header_bytes[0x38..0x40].copy_from_slice(&trigram_table_offset.to_le_bytes());
        header_bytes[0x40..0x48].copy_from_slice(&trigram_table_size.to_le_bytes());
        header_bytes[0x48..0x50].copy_from_slice(&posting_data_offset.to_le_bytes());
        header_bytes[0x50..0x58].copy_from_slice(&posting_data_size.to_le_bytes());
        header_bytes[0x58..0x60].copy_from_slice(&bloom_offset.to_le_bytes());
        header_bytes[0x60..0x68].copy_from_slice(&bloom_size.to_le_bytes());
        header_bytes[0x68..0x70].copy_from_slice(&string_pool_offset.to_le_bytes());
        header_bytes[0x70..0x78].copy_from_slice(&string_pool_size.to_le_bytes());
        header_bytes[0x78..0x80].copy_from_slice(&name_index_offset.to_le_bytes());
        header_bytes[0x80..0x88].copy_from_slice(&name_index_size.to_le_bytes());

        let crc = crc32c::crc32c(&header_bytes[0..0xF8]);
        header_bytes[0xF8..0xFC].copy_from_slice(&crc.to_le_bytes());

        // Seek back, write the real header, and flush before dropping so the
        // rename only happens after every byte has reached the file.
        f.seek(SeekFrom::Start(0))?;
        f.write_all(&header_bytes)?;
        f.flush()?;
        drop(f);

        // Atomic publish: readers see either the old shard or the new one.
        fs::rename(&tmp_path, &final_path)?;
        Ok(final_path)
    }
454
455    fn align_to_8<W: Write + Seek>(&self, mut w: W) -> std::io::Result<u64> {
456        let pos = w.stream_position()?;
457        let padding = (8 - (pos % 8)) % 8;
458        if padding > 0 {
459            w.write_all(&vec![0u8; padding as usize])?;
460        }
461        w.stream_position()
462    }
463}