1use crate::bloom::BloomFilter;
6use crate::error::{Error, Result};
7use crate::format::{FILE_ENTRY_SIZE, FileStatus, HEADER_SIZE, Header};
8use crate::posting::PostingList;
9use crate::string_pool::StringPoolReader;
10use crate::trigram::Trigram;
11use memmap2::Mmap;
12use std::fs::File;
13use std::path::{Path, PathBuf};
14use std::time::UNIX_EPOCH;
15
16#[cfg(unix)]
17use std::os::unix::fs::MetadataExt;
18
/// Summary statistics for a single index shard, copied out of its header.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ShardMetadata {
    /// Shard creation timestamp, taken from the header's `created_at` field.
    pub shard_timestamp: u64,
    /// Number of file entries recorded in the shard.
    pub file_count: u32,
    /// Number of distinct trigrams recorded in the shard.
    pub trigram_count: u32,
}
29
/// One record of the sparse CDX block index: maps the first trigram key in a
/// compressed block to that block's byte offset within the shard file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CdxBlockEntry {
    /// Smallest trigram key stored in the block.
    pub first_key: u32,
    /// Absolute byte offset of the compressed block in the shard file.
    pub block_offset: u64,
}
38
/// Memory-mapped, read-only view over a single index shard file.
pub struct Reader {
    // Owns the mapping; every other field reads (directly, or via the
    // 'static transmute performed in `open`) from this buffer.
    mmap: Mmap,
    /// Parsed and bounds-validated copy of the on-disk header.
    pub header: Header,
    // NOTE(review): 'static here is a self-referential borrow of `mmap`
    // (see the transmute in `open`) — sound only while this struct owns the
    // mapping. Fields drop in declaration order, so `mmap` drops first;
    // confirm `StringPoolReader` has no Drop impl that touches the bytes.
    string_pool: StringPoolReader<'static>,
    // Inode captured at open time (Unix only), used by `is_stale` to detect
    // file replacement; `None` on non-Unix targets.
    inode: Option<u64>,
    // Sparse index over compressed trigram blocks; empty when the shard
    // carries no CDX section.
    cdx_blocks: Vec<CdxBlockEntry>,
}
48
/// Location and statistics of one trigram's posting list within the shard.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TrigramInfo {
    /// Absolute byte offset of the encoded posting list in the shard file.
    pub posting_offset: u64,
    /// Encoded length of the posting list, in bytes.
    pub posting_length: u32,
    /// Number of documents that contain this trigram.
    pub doc_frequency: u32,
}
59
/// Decoded metadata for one indexed file, read from the shard's file table.
#[derive(Debug)]
pub struct FileInfo {
    /// Position of this entry in the shard's file table.
    pub file_id: u32,
    /// Path resolved through the shard's string pool.
    pub path: PathBuf,
    /// Indexing status decoded from the on-disk entry byte.
    pub status: FileStatus,
    // Modification time; the field name says nanoseconds — NOTE(review):
    // `get_last_modified` in this file reports microseconds, confirm the
    // writer's unit actually matches the name.
    pub mtime_ns: u64,
    /// File size in bytes as recorded at index time.
    pub size_bytes: u64,
    /// 64-bit content hash recorded by the indexer.
    pub content_hash: u64,
}
76
77#[allow(clippy::as_conversions)] #[allow(clippy::indexing_slicing)] impl Reader {
80 pub fn open(path: &Path) -> Result<Self> {
87 let file = File::open(path)?;
88
89 let mmap = unsafe { Mmap::map(&file)? };
93
94 if mmap.len() < HEADER_SIZE {
95 return Err(Error::IndexTooSmall);
96 }
97
98 let header = Header::parse(&mmap[0..HEADER_SIZE])?;
99 header.validate_bounds(mmap.len() as u64)?;
100
101 #[cfg(unix)]
102 let inode = Some(file.metadata()?.ino());
103
104 #[cfg(not(unix))]
105 let inode = None;
106
107 let string_pool_data: &'static [u8] = unsafe {
114 let start = header.string_pool_offset as usize;
115 let end = (header.string_pool_offset + header.string_pool_size) as usize;
116 std::mem::transmute::<&[u8], &'static [u8]>(&mmap[start..end])
117 };
118 let string_pool = StringPoolReader::new(string_pool_data)?;
119
120 let cdx_blocks = if header.has_cdx() && header.cdx_block_index_size > 0 {
121 let idx_start = header.cdx_block_index_offset as usize;
122 let idx_end = idx_start + header.cdx_block_index_size as usize;
123 let idx_data = mmap
124 .get(idx_start..idx_end)
125 .ok_or(Error::SectionOutOfBounds {
126 section: "cdx_block_index",
127 offset: header.cdx_block_index_offset,
128 size: header.cdx_block_index_size,
129 file_len: mmap.len() as u64,
130 })?;
131 let mut blocks = Vec::new();
132 let mut pos = 0;
133 while pos + 12 <= idx_data.len() {
134 let first_key = u32::from_le_bytes(
135 idx_data[pos..pos + 4]
136 .try_into()
137 .map_err(|_| Error::Config("bad cdx key".into()))?,
138 );
139 if first_key == u32::MAX {
140 break;
141 }
142 let block_offset = u64::from_le_bytes(
143 idx_data[pos + 4..pos + 12]
144 .try_into()
145 .map_err(|_| Error::Config("bad cdx offset".into()))?,
146 );
147 blocks.push(CdxBlockEntry {
148 first_key,
149 block_offset,
150 });
151 pos += 12;
152 }
153 blocks
154 } else {
155 Vec::new()
156 };
157
158 Ok(Self {
159 mmap,
160 header,
161 string_pool,
162 inode,
163 cdx_blocks,
164 })
165 }
166
167 pub fn get_last_modified(root: &Path) -> Result<u64> {
173 let mut last_modified = 0u64;
174 let walker = ignore::WalkBuilder::new(root)
175 .hidden(false)
176 .git_ignore(true)
177 .require_git(false)
178 .add_custom_ignore_filename(".ixignore")
179 .filter_entry(move |entry| {
180 let path = entry.path();
181 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
182
183 if entry.file_type().is_some_and(|t| t.is_dir())
184 && matches!(
185 name,
186 "lost+found"
187 | ".git"
188 | "node_modules"
189 | "target"
190 | "__pycache__"
191 | ".tox"
192 | ".venv"
193 | "venv"
194 | ".ix"
195 )
196 {
197 return false;
198 }
199
200 if entry.file_type().is_some_and(|t| t.is_file()) {
201 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
202 if matches!(
203 ext,
204 "so" | "o"
205 | "dylib"
206 | "a"
207 | "dll"
208 | "exe"
209 | "pyc"
210 | "jpg"
211 | "png"
212 | "gif"
213 | "mp4"
214 | "mp3"
215 | "pdf"
216 | "zip"
217 | "7z"
218 | "rar"
219 | "sqlite"
220 | "db"
221 | "bin"
222 ) || name.ends_with(".tar.gz")
223 {
224 return false;
225 }
226 }
227 true
228 })
229 .build();
230
231 for result in walker {
232 match result {
233 Ok(entry) => {
234 if entry.file_type().is_some_and(|t| t.is_file()) {
235 let metadata =
236 entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
237 let mtime = metadata
238 .modified()
239 .and_then(|t| {
240 t.duration_since(UNIX_EPOCH)
241 .map_err(|_| std::io::Error::other("time went backwards"))
242 })
243 .map_or(0, |d| d.as_micros() as u64);
244 if mtime > last_modified {
245 last_modified = mtime;
246 }
247 }
248 }
249 Err(e) => {
250 eprintln!("ix: warning: stale check skipping path: {e}");
251 }
252 }
253 }
254 Ok(last_modified)
255 }
256
257 pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
263 if self.header.has_cdx() && !self.cdx_blocks.is_empty() {
264 return self.get_trigram_cdx(trigram);
265 }
266
267 let count = self.header.trigram_count as usize;
269 let table_start = self.header.trigram_table_offset as usize;
270 let entry_size = crate::format::TRIGRAM_ENTRY_SIZE;
271
272 let mut low = 0;
273 let mut high = count;
274
275 while low < high {
276 let mid = low + (high - low) / 2;
277 let entry_off = table_start + mid * entry_size;
278
279 let key_bytes = self.mmap.get(entry_off..entry_off + 4)?;
280 let key = u32::from_le_bytes(key_bytes.try_into().ok()?);
281
282 match key.cmp(&trigram) {
283 std::cmp::Ordering::Equal => {
284 let entry = self.mmap.get(entry_off..entry_off + entry_size)?;
285
286 let mut off_bytes = [0u8; 8];
287 off_bytes[..6].copy_from_slice(&entry[4..10]);
288 let posting_offset = u64::from_le_bytes(off_bytes);
289
290 let posting_length = entry
291 .get(10..14)
292 .and_then(|s| s.try_into().ok())
293 .map(u32::from_le_bytes)?;
294
295 let doc_frequency = entry
296 .get(14..18)
297 .and_then(|s| s.try_into().ok())
298 .map(u32::from_le_bytes)?;
299
300 return Some(TrigramInfo {
301 posting_offset,
302 posting_length,
303 doc_frequency,
304 });
305 }
306 std::cmp::Ordering::Less => low = mid + 1,
307 std::cmp::Ordering::Greater => high = mid,
308 }
309 }
310
311 None
312 }
313
314 fn get_trigram_cdx(&self, trigram: Trigram) -> Option<TrigramInfo> {
315 let mut block_idx = 0;
316 for (i, entry) in self.cdx_blocks.iter().enumerate() {
317 if entry.first_key > trigram {
318 break;
319 }
320 block_idx = i;
321 }
322
323 let block_entry = self.cdx_blocks.get(block_idx)?;
324
325 let block_end = self.cdx_blocks.get(block_idx + 1).map_or_else(
326 || self.header.trigram_table_offset + self.header.trigram_table_size,
327 |next| next.block_offset,
328 );
329
330 let block_start = block_entry.block_offset as usize;
331 let block_end = block_end as usize;
332 let block_data = self.mmap.get(block_start..block_end)?;
333
334 let decompressed = match zstd::decode_all(block_data) {
335 Ok(d) => d,
336 Err(e) => {
337 tracing::warn!("ix: CDX block decompression failed: {e}");
338 return None;
339 }
340 };
341
342 let mut pos = 0;
343 let num_entries = match crate::varint::decode(&decompressed, &mut pos) {
344 Ok(v) => usize::try_from(v).unwrap_or(0),
345 Err(e) => {
346 tracing::warn!("ix: CDX num_entries varint decode failed: {e}");
347 return None;
348 }
349 };
350
351 let mut last_key = 0u32;
352 for _ in 0..num_entries {
353 let key_delta = match crate::varint::decode(&decompressed, &mut pos) {
354 Ok(v) => u32::try_from(v).unwrap_or(0),
355 Err(e) => {
356 tracing::warn!("ix: CDX key_delta varint decode failed: {e}");
357 return None;
358 }
359 };
360 let key = last_key + key_delta;
361 last_key = key;
362
363 let posting_offset = match crate::varint::decode(&decompressed, &mut pos) { Ok(v) => v, Err(e) => { tracing::warn!("ix: CDX posting_offset varint decode failed: {e}"); return None; } };
364 let posting_length = match crate::varint::decode(&decompressed, &mut pos) {
365 Ok(v) => u32::try_from(v).unwrap_or(0),
366 Err(e) => {
367 tracing::warn!("ix: CDX posting_length varint decode failed: {e}");
368 return None;
369 }
370 };
371 let doc_frequency = match crate::varint::decode(&decompressed, &mut pos) {
372 Ok(v) => u32::try_from(v).unwrap_or(0),
373 Err(e) => {
374 tracing::warn!("ix: CDX doc_frequency varint decode failed: {e}");
375 return None;
376 }
377 };
378
379 if key == trigram {
380 return Some(TrigramInfo {
381 posting_offset,
382 posting_length,
383 doc_frequency,
384 });
385 }
386 if key > trigram {
387 break;
388 }
389 }
390
391 None
392 }
393
394 pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
400 let start = info.posting_offset as usize;
401 let end = start + info.posting_length as usize;
402 if end > self.mmap.len() {
403 return Err(Error::PostingOutOfBounds);
404 }
405 PostingList::decode(&self.mmap[start..end])
406 }
407
408 pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
415 if file_id >= self.header.file_count {
416 return Err(Error::FileIdOutOfBounds(file_id));
417 }
418
419 let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
420 let entry = self
421 .mmap
422 .get(entry_off..entry_off + FILE_ENTRY_SIZE)
423 .ok_or(Error::SectionOutOfBounds {
424 section: "file_entry",
425 offset: entry_off as u64,
426 size: FILE_ENTRY_SIZE as u64,
427 file_len: self.mmap.len() as u64,
428 })?;
429
430 let path_off = u32::from_le_bytes(
431 entry[4..8]
432 .try_into()
433 .map_err(|_| Error::Config("invalid path offset".into()))?,
434 );
435 let status = FileStatus::from_u8(entry[10]);
436 let mtime_ns = u64::from_le_bytes(
437 entry[12..20]
438 .try_into()
439 .map_err(|_| Error::Config("invalid mtime".into()))?,
440 );
441 let size_bytes = u64::from_le_bytes(
442 entry[20..28]
443 .try_into()
444 .map_err(|_| Error::Config("invalid size".into()))?,
445 );
446 let content_hash = u64::from_le_bytes(
447 entry[28..36]
448 .try_into()
449 .map_err(|_| Error::Config("invalid hash".into()))?,
450 );
451
452 let path = self.string_pool.resolve(path_off)?;
453
454 Ok(FileInfo {
455 file_id,
456 path: PathBuf::from(path),
457 status,
458 mtime_ns,
459 size_bytes,
460 content_hash,
461 })
462 }
463
464 #[must_use]
469 pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
470 if !self.header.has_bloom() {
471 return true;
472 }
473
474 let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
475 let Some(bloom_bytes) = self.mmap.get(entry_off + 40..entry_off + 44) else {
476 return true;
477 };
478
479 let bloom_rel_off = match bloom_bytes.try_into() {
480 Ok(b) => u32::from_le_bytes(b),
481 Err(_) => return true,
482 };
483 let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;
484
485 let Some(size_bytes) = self.mmap.get(bloom_abs_off..bloom_abs_off + 2) else {
486 return true;
487 };
488 let size = match size_bytes.try_into() {
489 Ok(b) => u16::from_le_bytes(b),
490 Err(_) => return true,
491 } as usize;
492
493 let num_hashes = self.mmap.get(bloom_abs_off + 2).copied().unwrap_or(0);
494 let Some(bits) = self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) else {
495 return true;
496 };
497
498 BloomFilter::slice_contains(bits, num_hashes, trigram)
499 }
500
501 #[must_use]
503 pub const fn metadata(&self) -> ShardMetadata {
504 ShardMetadata {
505 shard_timestamp: self.header.created_at,
506 file_count: self.header.file_count,
507 trigram_count: self.header.trigram_count,
508 }
509 }
510
511 #[must_use]
520 pub fn is_stale(&self, path: &Path) -> bool {
521 let Ok(current) = std::fs::metadata(path) else {
522 return true;
523 };
524
525 if current.len() as usize != self.mmap.len() {
526 return true;
527 }
528
529 #[cfg(unix)]
530 {
531 if let Some(stored_inode) = self.inode
532 && current.ino() != stored_inode
533 {
534 return true;
535 }
536 }
537
538 false
539 }
540}