use crate::bloom::BloomFilter;
use crate::decompress::maybe_decompress;
use crate::error::Result;
use crate::format::*;
use crate::posting::{PostingEntry, PostingList};
use crate::string_pool::StringPool;
use crate::trigram::{Extractor, Trigram};
use ignore::WalkBuilder;
use memmap2::Mmap;
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{BufWriter, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::time::{Instant, SystemTime, UNIX_EPOCH};
/// Accumulates per-file trigram data for a source tree and serializes it
/// into an on-disk shard (see `serialize`).
pub struct Builder {
    /// Root directory walked for files to index.
    root: PathBuf,
    /// One record per indexed file, in file-id order.
    files: Vec<FileRecord>,
    /// Inverted index: trigram -> posting entries across all files.
    postings: HashMap<Trigram, Vec<PostingEntry>>,
    /// Interned file paths; serialized as the shard's string-pool section.
    string_pool: StringPool,
    /// Trigram extractor reused across files.
    extractor: Extractor,
    /// Counters gathered during a build and logged afterwards.
    stats: BuildStats,
    /// When true, compressed inputs are decompressed before indexing
    /// (via `maybe_decompress`); toggled with `set_decompress`.
    decompress: bool,
}
/// Everything the builder remembers about one indexed file.
pub struct FileRecord {
    /// Dense id assigned in processing order; may be renumbered by `update`.
    pub file_id: u32,
    /// Path of the file as produced by the directory walker.
    pub path: PathBuf,
    /// Modification time in nanoseconds since the Unix epoch (0 if
    /// unavailable or pre-epoch).
    pub mtime_ns: u64,
    /// On-disk size in bytes at scan time.
    pub size_bytes: u64,
    /// xxHash64 (seed 0) of the (possibly decompressed) content.
    pub content_hash: u64,
    /// Number of distinct trigrams found in the file.
    pub trigram_count: u32,
    /// Always false for stored records: binary files are skipped entirely
    /// in `process_file` and never produce a `FileRecord`.
    pub is_binary: bool,
    /// Per-trigram byte offsets, cached so `update` can rebuild postings
    /// without rescanning unchanged files.
    pub trigrams: Vec<(Trigram, Vec<u32>)>,
    /// Per-file bloom filter over the file's trigrams.
    pub bloom: BloomFilter,
}
/// Counters gathered while building; logged when `build` completes.
#[derive(Default, Debug)]
pub struct BuildStats {
    /// Files successfully scanned and indexed.
    pub files_scanned: u64,
    /// Files skipped because their content was detected as binary.
    pub files_skipped_binary: u64,
    /// Files skipped for exceeding the size cap.
    pub files_skipped_size: u64,
    /// Total bytes of scanned file content.
    pub bytes_scanned: u64,
    /// Number of distinct trigrams in the index.
    pub unique_trigrams: u64,
}
impl Builder {
pub fn new(root: &Path) -> Self {
Self {
root: root.to_owned(),
files: Vec::new(),
postings: HashMap::new(),
string_pool: StringPool::new(),
extractor: Extractor::new(),
stats: BuildStats::default(),
decompress: false,
}
}
/// Enable or disable transparent decompression of compressed inputs;
/// consulted by `process_file` before extracting trigrams.
pub fn set_decompress(&mut self, decompress: bool) {
    self.decompress = decompress;
}
/// Walk the root, index every discovered file, and write the shard.
///
/// Returns the path of the serialized index file. File ids are assigned
/// densely in discovery order; files that `process_file` rejects (too
/// large, binary) do not consume an id.
pub fn build(&mut self) -> Result<PathBuf> {
    let started = Instant::now();
    let mut next_id = 0u32;
    for path in self.discover_files()? {
        // process_file reports whether the file was actually indexed.
        if self.process_file(next_id, path)? {
            next_id += 1;
        }
    }
    let output_path = self.serialize()?;
    tracing::info!("Build completed in {:?}: {:?}", started.elapsed(), self.stats);
    Ok(output_path)
}
/// Incrementally rebuild the index after `changed_files` were modified,
/// created, or deleted.
///
/// Falls back to a full `build()` when nothing is indexed yet. File ids
/// are reassigned densely: unchanged, still-present files reuse their
/// cached trigram offsets and bloom filters; changed files are rescanned;
/// deleted files are dropped; changed paths not previously indexed are
/// appended as new files. Ends by serializing the shard to disk.
pub fn update(&mut self, changed_files: &[PathBuf]) -> Result<PathBuf> {
    let start = Instant::now();
    if self.files.is_empty() {
        // No prior state: an incremental pass would be a full build anyway.
        return self.build();
    }
    let changed_set: std::collections::HashSet<_> = changed_files.iter().collect();
    // Rebuild postings and the string pool from scratch; the cached
    // per-file trigram lists make this cheap for unchanged files.
    let old_files = std::mem::take(&mut self.files);
    self.postings = HashMap::new();
    self.string_pool = StringPool::new();
    // NOTE(review): self.stats is not reset here, so counters such as
    // bytes_scanned accumulate across incremental builds — confirm that
    // is intended before relying on them (serialize writes bytes_scanned
    // into the shard header).
    let mut next_id = 0u32;
    for mut record in old_files {
        if changed_set.contains(&record.path) {
            // Rescan only if the file still exists. Previously a changed
            // file that had been deleted reached process_file, whose
            // fs::metadata call failed and aborted the whole update;
            // deletions are now simply dropped from the index.
            if record.path.exists() && self.process_file(next_id, record.path.clone())? {
                next_id += 1;
            }
        } else if record.path.exists() {
            // Unchanged and still present: replay the cached trigram
            // offsets under a freshly assigned file id.
            self.string_pool.add_path(&record.path);
            for (tri, offsets) in &record.trigrams {
                self.postings.entry(*tri).or_default().push(PostingEntry {
                    file_id: next_id,
                    offsets: offsets.clone(),
                });
            }
            record.file_id = next_id;
            self.files.push(record);
            next_id += 1;
        }
    }
    // Index any changed paths that were not previously known (new files).
    let mut to_add = Vec::new();
    {
        let existing_paths: std::collections::HashSet<_> =
            self.files.iter().map(|f| &f.path).collect();
        for path in changed_files {
            if !existing_paths.contains(path) && path.exists() {
                to_add.push(path.clone());
            }
        }
    }
    for path in to_add {
        if self.process_file(next_id, path)? {
            next_id += 1;
        }
    }
    let output_path = self.serialize()?;
    tracing::info!("Incremental update completed in {:?}", start.elapsed());
    Ok(output_path)
}
/// Number of files currently held in the in-memory index.
pub fn files_len(&self) -> usize {
    self.files.len()
}
/// Number of distinct trigrams across all indexed files.
pub fn trigrams_len(&self) -> usize {
    self.postings.len()
}
/// Walk the tree under `self.root` and collect every regular file worth
/// indexing.
///
/// Honors `.gitignore` and a custom `.ixignore`, prunes well-known
/// build/VCS/cache directories, and rejects files whose extension marks
/// them as binary media or archives. Walk errors are reported on stderr
/// and the offending path is skipped rather than failing the build.
fn discover_files(&mut self) -> Result<Vec<PathBuf>> {
    // Directories that are never descended into.
    const SKIP_DIRS: &[&str] = &[
        "lost+found", ".git", "node_modules", "target",
        "__pycache__", ".tox", ".venv", "venv", ".ix",
    ];
    // Extensions of files that are never indexed (binaries, media, archives).
    const SKIP_EXTS: &[&str] = &[
        "so", "o", "dylib", "a", "dll", "exe", "pyc",
        "jpg", "png", "gif", "mp4", "mp3", "pdf",
        "zip", "7z", "rar", "sqlite", "db", "bin",
    ];
    let walker = WalkBuilder::new(&self.root)
        .hidden(false)
        .git_ignore(true)
        .require_git(false)
        .add_custom_ignore_filename(".ixignore")
        .filter_entry(move |entry| {
            let path = entry.path();
            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            if entry.file_type().map_or(false, |t| t.is_dir()) && SKIP_DIRS.contains(&name) {
                return false;
            }
            if entry.file_type().map_or(false, |t| t.is_file()) {
                let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
                if SKIP_EXTS.contains(&ext) || name.ends_with(".tar.gz") {
                    return false;
                }
            }
            true
        })
        .build();
    let mut discovered = Vec::new();
    for walked in walker {
        match walked {
            Ok(entry) => {
                if entry.file_type().map_or(false, |t| t.is_file()) {
                    discovered.push(entry.path().to_owned());
                }
            }
            Err(e) => {
                eprintln!("ix: warning: skipping path: {}", e);
            }
        }
    }
    Ok(discovered)
}
/// Scan a single file and merge its trigrams into the in-memory index.
///
/// Returns `Ok(true)` when the file was indexed under `file_id`, and
/// `Ok(false)` when it was skipped (over 100 MiB, or binary content) —
/// in which case the caller must not consume the id.
fn process_file(&mut self, file_id: u32, path: PathBuf) -> Result<bool> {
    let metadata = fs::metadata(&path)?;
    let size = metadata.len();
    // Mtime as nanoseconds since the epoch; unavailable/pre-epoch -> 0.
    let mtime = metadata
        .modified()?
        .duration_since(UNIX_EPOCH)
        .map(|d| d.as_nanos() as u64)
        .unwrap_or(0);
    // Hard cap: refuse to index files over 100 MiB.
    if size > 100 * 1024 * 1024 {
        self.stats.files_skipped_size += 1;
        return Ok(false);
    }
    // `mmap` is declared in the outer scope so the Cow::Borrowed arms
    // below can borrow from it for the rest of the function.
    let mmap;
    let raw_data: std::borrow::Cow<[u8]> = if size == 0 {
        // memmap2 rejects zero-length mappings, so mapping an empty file
        // previously failed the whole build; treat it as empty content.
        std::borrow::Cow::Owned(Vec::new())
    } else {
        let file = File::open(&path)?;
        // SAFETY: read-only mapping of a regular file, dropped before this
        // function returns. Concurrent truncation by another process is
        // the standard mmap caveat and is accepted here, as before.
        mmap = unsafe { Mmap::map(&file)? };
        if self.decompress {
            if let Some(mut reader) = maybe_decompress(&path, &mmap)? {
                use std::io::Read;
                let mut buf = Vec::new();
                reader.read_to_end(&mut buf)?;
                std::borrow::Cow::Owned(buf)
            } else {
                std::borrow::Cow::Borrowed(&mmap[..])
            }
        } else {
            std::borrow::Cow::Borrowed(&mmap[..])
        }
    };
    let data = &raw_data[..];
    if is_binary(data) {
        self.stats.files_skipped_binary += 1;
        return Ok(false);
    }
    let content_hash = xxhash_rust::xxh64::xxh64(data, 0);
    // (trigram, byte-offset) pairs. NOTE(review): the run-length grouping
    // below assumes equal trigrams come out adjacent (e.g. sorted) from
    // extract_with_offsets — confirm against the extractor.
    let pairs = self.extractor.extract_with_offsets(data);
    self.string_pool.add_path(&path);
    let mut bloom = BloomFilter::new(256, 5);
    let mut trigram_count = 0;
    let mut trigrams = Vec::with_capacity(pairs.len().min(4096));
    // Walk runs of identical trigrams, collecting each run's offsets.
    let mut i = 0;
    while i < pairs.len() {
        let tri = pairs[i].0;
        let mut j = i + 1;
        while j < pairs.len() && pairs[j].0 == tri {
            j += 1;
        }
        let offsets: Vec<u32> = pairs[i..j].iter().map(|p| p.1).collect();
        bloom.insert(tri);
        self.postings.entry(tri).or_default().push(PostingEntry {
            file_id,
            offsets: offsets.clone(),
        });
        trigrams.push((tri, offsets));
        trigram_count += 1; // counts distinct trigrams, not occurrences
        i = j;
    }
    self.files.push(FileRecord {
        file_id,
        path,
        mtime_ns: mtime,
        size_bytes: size,
        content_hash,
        trigram_count,
        is_binary: false,
        trigrams,
        bloom,
    });
    self.stats.files_scanned += 1;
    self.stats.bytes_scanned += size;
    // Keep the (previously never-updated) unique-trigram counter accurate.
    self.stats.unique_trigrams = self.postings.len() as u64;
    Ok(true)
}
/// Write the whole in-memory index to `<root>/.ix/shard.ix`.
///
/// Layout (each section 8-byte aligned): header placeholder, file table,
/// posting data, trigram table, per-file bloom filters, string pool. The
/// header is filled in last with section offsets/sizes and a CRC32C over
/// its first 0xF8 bytes, then the temp file is renamed into place so
/// readers never observe a partially written shard.
fn serialize(&mut self) -> Result<PathBuf> {
    let ix_dir = self.root.join(".ix");
    fs::create_dir_all(&ix_dir)?;
    // Write to a temp file, rename at the end for crash-safe replacement.
    let tmp_path = ix_dir.join("shard.ix.tmp");
    let final_path = ix_dir.join("shard.ix");
    let mut f = BufWriter::new(File::create(&tmp_path)?);
    // Reserve header space; rewritten once all offsets are known.
    f.write_all(&[0u8; HEADER_SIZE])?;
    // Serialize the string pool into memory first so per-record
    // (offset, len) lookups are available while writing the file table.
    let mut string_pool_buf = std::io::Cursor::new(Vec::new());
    self.string_pool.serialize(&mut string_pool_buf)?;
    let string_pool_data = string_pool_buf.into_inner();
    // --- File table, one fixed-size record per file ---
    let file_table_offset = self.align_to_8(&mut f)?;
    for record in &self.files {
        let (path_off, path_len) = self.string_pool.get_info(&record.path);
        f.write_all(&record.file_id.to_le_bytes())?;
        f.write_all(&path_off.to_le_bytes())?;
        f.write_all(&path_len.to_le_bytes())?;
        f.write_all(&[FileStatus::Fresh as u8])?;
        f.write_all(&[0u8])?; // padding byte after the status
        f.write_all(&record.mtime_ns.to_le_bytes())?;
        f.write_all(&record.size_bytes.to_le_bytes())?;
        f.write_all(&record.content_hash.to_le_bytes())?;
        f.write_all(&record.trigram_count.to_le_bytes())?;
        // Placeholder for the bloom-filter offset (patched below),
        // followed by 4 bytes of trailing padding.
        f.write_all(&0u32.to_le_bytes())?;
        f.write_all(&[0u8; 4])?;
    }
    let file_table_size = f.stream_position()? - file_table_offset;
    self.align_to_8(&mut f)?;
    // --- Posting lists, written in sorted trigram order so output is
    // deterministic ---
    let posting_data_offset = f.stream_position()?;
    let mut posting_infos = HashMap::new();
    let mut sorted_trigrams: Vec<Trigram> = self.postings.keys().cloned().collect();
    sorted_trigrams.sort_unstable();
    for &tri in &sorted_trigrams {
        let entries = &self.postings[&tri];
        let list = PostingList {
            entries: entries.clone(),
        };
        let encoded = list.encode();
        // Offset relative to the start of the posting-data section.
        let offset = f.stream_position()? - posting_data_offset;
        f.write_all(&encoded)?;
        posting_infos.insert(tri, (offset, encoded.len() as u32, entries.len() as u32));
    }
    let posting_data_size = f.stream_position()? - posting_data_offset;
    self.align_to_8(&mut f)?;
    // --- Trigram table: (trigram, absolute offset, encoded len, freq) ---
    let trigram_table_offset = f.stream_position()?;
    for &tri in &sorted_trigrams {
        // Lookup always succeeds: posting_infos was filled from the same
        // sorted_trigrams list above.
        if let Some(&(off, len, freq)) = posting_infos.get(&tri) {
            let abs_off = posting_data_offset + off;
            f.write_all(&tri.to_le_bytes())?;
            // Only the low 6 bytes of the offset are stored, i.e. shards
            // are assumed to stay below 2^48 bytes.
            f.write_all(&abs_off.to_le_bytes()[..6])?;
            f.write_all(&len.to_le_bytes())?;
            f.write_all(&freq.to_le_bytes())?;
            f.write_all(&[0u8; 2])?; // pad the entry
        }
    }
    let trigram_table_size = f.stream_position()? - trigram_table_offset;
    self.align_to_8(&mut f)?;
    // --- Per-file bloom filters; remember each one's relative offset so
    // the file table can be patched below ---
    let bloom_offset = f.stream_position()?;
    let mut bloom_relative_offsets = Vec::new();
    for record in &self.files {
        let rel_off = (f.stream_position()? - bloom_offset) as u32;
        bloom_relative_offsets.push(rel_off);
        record.bloom.serialize(&mut f)?;
    }
    let bloom_size = f.stream_position()? - bloom_offset;
    self.align_to_8(&mut f)?;
    // --- String pool ---
    let string_pool_offset = f.stream_position()?;
    f.write_all(&string_pool_data)?;
    let string_pool_size = string_pool_data.len() as u64;
    // Name index is not produced yet; record an empty section.
    let name_index_offset = f.stream_position()?;
    let name_index_size = 0u64;
    // Patch each file-table record's bloom-offset field in place.
    // NOTE(review): this assumes every record is exactly 48 bytes
    // (40 skipped + 4 written + 4 skipped). That only holds if the
    // path_off/path_len values written above are 4 bytes each — verify
    // against the return types of StringPool::get_info.
    f.seek(SeekFrom::Start(file_table_offset))?;
    for (i, _record) in self.files.iter().enumerate() {
        f.seek(SeekFrom::Current(40))?;
        f.write_all(&bloom_relative_offsets[i].to_le_bytes())?;
        f.seek(SeekFrom::Current(4))?;
    }
    // Creation timestamp in microseconds since the epoch.
    let created_at = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_micros() as u64;
    // --- Header: magic, version, flags, counts, section table ---
    let mut header_bytes = [0u8; HEADER_SIZE];
    header_bytes[0..4].copy_from_slice(&MAGIC);
    header_bytes[0x04..0x06].copy_from_slice(&VERSION_MAJOR.to_le_bytes());
    header_bytes[0x06..0x08].copy_from_slice(&VERSION_MINOR.to_le_bytes());
    header_bytes[0x08..0x10].copy_from_slice(
        &(flags::HAS_BLOOM_FILTERS
            | flags::HAS_CONTENT_HASHES
            | flags::POSTING_LISTS_CHECKSUMMED)
            .to_le_bytes(),
    );
    header_bytes[0x10..0x18].copy_from_slice(&created_at.to_le_bytes());
    header_bytes[0x18..0x20].copy_from_slice(&self.stats.bytes_scanned.to_le_bytes());
    header_bytes[0x20..0x24].copy_from_slice(&(self.files.len() as u32).to_le_bytes());
    header_bytes[0x24..0x28].copy_from_slice(&(sorted_trigrams.len() as u32).to_le_bytes());
    header_bytes[0x28..0x30].copy_from_slice(&file_table_offset.to_le_bytes());
    header_bytes[0x30..0x38].copy_from_slice(&file_table_size.to_le_bytes());
    header_bytes[0x38..0x40].copy_from_slice(&trigram_table_offset.to_le_bytes());
    header_bytes[0x40..0x48].copy_from_slice(&trigram_table_size.to_le_bytes());
    header_bytes[0x48..0x50].copy_from_slice(&posting_data_offset.to_le_bytes());
    header_bytes[0x50..0x58].copy_from_slice(&posting_data_size.to_le_bytes());
    header_bytes[0x58..0x60].copy_from_slice(&bloom_offset.to_le_bytes());
    header_bytes[0x60..0x68].copy_from_slice(&bloom_size.to_le_bytes());
    header_bytes[0x68..0x70].copy_from_slice(&string_pool_offset.to_le_bytes());
    header_bytes[0x70..0x78].copy_from_slice(&string_pool_size.to_le_bytes());
    header_bytes[0x78..0x80].copy_from_slice(&name_index_offset.to_le_bytes());
    header_bytes[0x80..0x88].copy_from_slice(&name_index_size.to_le_bytes());
    // CRC32C over everything before the checksum field itself.
    let crc = crc32c::crc32c(&header_bytes[0..0xF8]);
    header_bytes[0xF8..0xFC].copy_from_slice(&crc.to_le_bytes());
    f.seek(SeekFrom::Start(0))?;
    f.write_all(&header_bytes)?;
    f.flush()?;
    drop(f);
    // Atomic-on-POSIX replacement of any previous shard.
    fs::rename(&tmp_path, &final_path)?;
    Ok(final_path)
}
/// Pad the stream with zero bytes so its position is a multiple of 8,
/// returning the (aligned) position afterwards.
fn align_to_8<W: Write + Seek>(&self, mut w: W) -> std::io::Result<u64> {
    let pos = w.stream_position()?;
    let padding = ((8 - (pos % 8)) % 8) as usize;
    if padding > 0 {
        // Pad from a fixed stack buffer; the previous per-call Vec
        // allocation was unnecessary since padding is always < 8 bytes.
        w.write_all(&[0u8; 8][..padding])?;
    }
    w.stream_position()
}
}