Skip to main content

ix/
reader.rs

1//! Index reader — the mmap-based query-time interface.
2//!
3//! Fast, zero-copy access to the index data.
4
5use crate::bloom::BloomFilter;
6use crate::error::{Error, Result};
7use crate::format::{FILE_ENTRY_SIZE, FileStatus, HEADER_SIZE, Header};
8use crate::posting::PostingList;
9use crate::string_pool::StringPoolReader;
10use crate::trigram::Trigram;
11use memmap2::Mmap;
12use std::fs::File;
13use std::path::{Path, PathBuf};
14use std::time::UNIX_EPOCH;
15
16#[cfg(unix)]
17use std::os::unix::fs::MetadataExt;
18
/// Lightweight snapshot of shard-level metadata (no mmap needed).
///
/// A plain `Copy` value: it holds copies of three header fields and no
/// reference into the mapped shard file. Produced by [`Reader::metadata`].
#[derive(Debug, Clone, Copy)]
pub struct ShardMetadata {
    /// Microsecond-precision Unix timestamp from the shard header.
    ///
    /// [`Reader::metadata`] fills this from the header's `created_at` field.
    pub shard_timestamp: u64,
    /// Total number of files indexed in this shard.
    pub file_count: u32,
    /// Total number of unique trigrams in this shard.
    pub trigram_count: u32,
}
29
/// One entry in the CDX block index: first trigram key + absolute block offset.
///
/// [`Reader::open`] decodes these from consecutive 12-byte records: a
/// little-endian `u32` key followed by a little-endian `u64` offset. Decoding
/// stops at the first record whose key is `u32::MAX`.
#[derive(Debug, Clone, Copy)]
pub struct CdxBlockEntry {
    /// First trigram key in this block.
    pub first_key: u32,
    /// Absolute byte offset of the compressed block.
    ///
    /// Used directly as an index into the mapped file; the block extends up to
    /// the next entry's offset (or the end of the trigram table for the last
    /// block).
    pub block_offset: u64,
}
38
/// Index reader — mmaps the shard file for zero-copy lookups.
///
/// Built by [`Reader::open`]. All lookups read straight out of the mapping;
/// only the CDX block index is decoded into an owned `Vec` up front.
pub struct Reader {
    // Read-only mapping of the whole shard file. `string_pool` borrows from
    // it, so it must stay alive (and unmodified) for as long as the reader.
    mmap: Mmap,
    /// Parsed shard header containing section offsets and sizes.
    pub header: Header,
    // View over the string-pool section of `mmap`. The `'static` lifetime is
    // produced by a transmute in `open`; the data really lives only as long
    // as `mmap`, so this field must never be handed out with that lifetime.
    string_pool: StringPoolReader<'static>,
    // Inode of the shard file recorded at open time on Unix (`None` on other
    // platforms); compared against the on-disk file by `is_stale`.
    inode: Option<u64>,
    // CDX block index decoded at open time; left empty when the header
    // reports no CDX section or an empty block index.
    cdx_blocks: Vec<CdxBlockEntry>,
}
48
/// Descriptor pointing into the trigram table for a single trigram.
///
/// Returned by [`Reader::get_trigram`] and consumed by
/// [`Reader::decode_postings`], which reads `posting_length` bytes starting
/// at `posting_offset` from the mapped file.
#[derive(Debug)]
pub struct TrigramInfo {
    /// Absolute file offset where the posting list begins.
    ///
    /// In the uncompressed trigram table this is stored as a 6-byte
    /// little-endian integer; in CDX blocks it is a varint.
    pub posting_offset: u64,
    /// Number of bytes in the encoded posting list.
    pub posting_length: u32,
    /// How many files contain this trigram (document frequency).
    pub doc_frequency: u32,
}
59
/// Metadata about a single file known to the index.
///
/// Decoded by [`Reader::get_file`] from one fixed-size file-table entry; the
/// path is resolved through the shard's string pool.
#[derive(Debug)]
pub struct FileInfo {
    /// Internal 0-based file identifier.
    ///
    /// Echoes the ID that was passed to [`Reader::get_file`].
    pub file_id: u32,
    /// Absolute path to the file on disk.
    // NOTE(review): the reader stores whatever the string pool resolves to;
    // "absolute" is a property of the writer — confirm there.
    pub path: PathBuf,
    /// Whether the file is fresh, stale, or deleted.
    pub status: FileStatus,
    /// Last modification time in nanoseconds since the Unix epoch.
    pub mtime_ns: u64,
    /// File size in bytes at index time.
    pub size_bytes: u64,
    /// XXH64 content hash computed at index time.
    pub content_hash: u64,
}
76
77#[allow(clippy::as_conversions)] // binary format: usize/u32/u64 casts for index decoding
78#[allow(clippy::indexing_slicing)] // binary format: fixed-size buffer ops, length-checked
79impl Reader {
80    /// Open and memory-map an index file for reading.
81    ///
82    /// # Errors
83    ///
84    /// Returns an error if the file cannot be opened, memory-mapped, or its
85    /// header is invalid.
86    pub fn open(path: &Path) -> Result<Self> {
87        let file = File::open(path)?;
88
89        // SAFETY: Mmap::map wraps the mmap(2) syscall. The file handle is kept alive
90        // by Mmap's internal Arc<File>, ensuring the underlying data remains valid
91        // for the lifetime of the mmap.
92        let mmap = unsafe { Mmap::map(&file)? };
93
94        if mmap.len() < HEADER_SIZE {
95            return Err(Error::IndexTooSmall);
96        }
97
98        let header = Header::parse(&mmap[0..HEADER_SIZE])?;
99        header.validate_bounds(mmap.len() as u64)?;
100
101        #[cfg(unix)]
102        let inode = Some(file.metadata()?.ino());
103
104        #[cfg(not(unix))]
105        let inode = None;
106
107        // SAFETY: We transmute the slice lifetime to 'static. This is sound because:
108        // INVARIANT: Reader owns the Mmap, which owns the underlying memory.
109        // INVARIANT: Mmap's data remains valid for the entire lifetime of Reader.
110        // INVARIANT: No mutable access to mmap occurs after construction.
111        // INVARIANT: StringPoolReader<'static> cannot outlive Reader (it's a field).
112        // This is the standard pattern for self-referential mmap structs in Rust.
113        let string_pool_data: &'static [u8] = unsafe {
114            let start = header.string_pool_offset as usize;
115            let end = (header.string_pool_offset + header.string_pool_size) as usize;
116            std::mem::transmute::<&[u8], &'static [u8]>(&mmap[start..end])
117        };
118        let string_pool = StringPoolReader::new(string_pool_data)?;
119
120        let cdx_blocks = if header.has_cdx() && header.cdx_block_index_size > 0 {
121            let idx_start = header.cdx_block_index_offset as usize;
122            let idx_end = idx_start + header.cdx_block_index_size as usize;
123            let idx_data = mmap
124                .get(idx_start..idx_end)
125                .ok_or(Error::SectionOutOfBounds {
126                    section: "cdx_block_index",
127                    offset: header.cdx_block_index_offset,
128                    size: header.cdx_block_index_size,
129                    file_len: mmap.len() as u64,
130                })?;
131            let mut blocks = Vec::new();
132            let mut pos = 0;
133            while pos + 12 <= idx_data.len() {
134                let first_key = u32::from_le_bytes(
135                    idx_data[pos..pos + 4]
136                        .try_into()
137                        .map_err(|_| Error::Config("bad cdx key".into()))?,
138                );
139                if first_key == u32::MAX {
140                    break;
141                }
142                let block_offset = u64::from_le_bytes(
143                    idx_data[pos + 4..pos + 12]
144                        .try_into()
145                        .map_err(|_| Error::Config("bad cdx offset".into()))?,
146                );
147                blocks.push(CdxBlockEntry {
148                    first_key,
149                    block_offset,
150                });
151                pos += 12;
152            }
153            blocks
154        } else {
155            Vec::new()
156        };
157
158        Ok(Self {
159            mmap,
160            header,
161            string_pool,
162            inode,
163            cdx_blocks,
164        })
165    }
166
167    /// Get the last modification time among all source files in the tree.
168    ///
169    /// # Errors
170    ///
171    /// Returns an error if the directory walk fails or metadata cannot be read.
172    pub fn get_last_modified(root: &Path) -> Result<u64> {
173        let mut last_modified = 0u64;
174        let walker = ignore::WalkBuilder::new(root)
175            .hidden(false)
176            .git_ignore(true)
177            .require_git(false)
178            .add_custom_ignore_filename(".ixignore")
179            .filter_entry(move |entry| {
180                let path = entry.path();
181                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
182
183                if entry.file_type().is_some_and(|t| t.is_dir())
184                    && matches!(
185                        name,
186                        "lost+found"
187                            | ".git"
188                            | "node_modules"
189                            | "target"
190                            | "__pycache__"
191                            | ".tox"
192                            | ".venv"
193                            | "venv"
194                            | ".ix"
195                    )
196                {
197                    return false;
198                }
199
200                if entry.file_type().is_some_and(|t| t.is_file()) {
201                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
202                    if matches!(
203                        ext,
204                        "so" | "o"
205                            | "dylib"
206                            | "a"
207                            | "dll"
208                            | "exe"
209                            | "pyc"
210                            | "jpg"
211                            | "png"
212                            | "gif"
213                            | "mp4"
214                            | "mp3"
215                            | "pdf"
216                            | "zip"
217                            | "7z"
218                            | "rar"
219                            | "sqlite"
220                            | "db"
221                            | "bin"
222                    ) || name.ends_with(".tar.gz")
223                    {
224                        return false;
225                    }
226                }
227                true
228            })
229            .build();
230
231        for result in walker {
232            match result {
233                Ok(entry) => {
234                    if entry.file_type().is_some_and(|t| t.is_file()) {
235                        let metadata =
236                            entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
237                        let mtime = metadata
238                            .modified()
239                            .and_then(|t| {
240                                t.duration_since(UNIX_EPOCH)
241                                    .map_err(|_| std::io::Error::other("time went backwards"))
242                            })
243                            .map_or(0, |d| d.as_micros() as u64);
244                        if mtime > last_modified {
245                            last_modified = mtime;
246                        }
247                    }
248                }
249                Err(e) => {
250                    eprintln!("ix: warning: stale check skipping path: {e}");
251                }
252            }
253        }
254        Ok(last_modified)
255    }
256
257    /// Binary search the trigram table. Returns `None` if the trigram
258    /// is unknown.
259    ///
260    /// When CDX compression is active, performs a two-level search:
261    /// first on the block index, then within the decompressed block.
262    pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
263        if self.header.has_cdx() && !self.cdx_blocks.is_empty() {
264            return self.get_trigram_cdx(trigram);
265        }
266
267        // Legacy fallback (no CDX)
268        let count = self.header.trigram_count as usize;
269        let table_start = self.header.trigram_table_offset as usize;
270        let entry_size = crate::format::TRIGRAM_ENTRY_SIZE;
271
272        let mut low = 0;
273        let mut high = count;
274
275        while low < high {
276            let mid = low + (high - low) / 2;
277            let entry_off = table_start + mid * entry_size;
278
279            let key_bytes = self.mmap.get(entry_off..entry_off + 4)?;
280            let key = u32::from_le_bytes(key_bytes.try_into().ok()?);
281
282            match key.cmp(&trigram) {
283                std::cmp::Ordering::Equal => {
284                    let entry = self.mmap.get(entry_off..entry_off + entry_size)?;
285
286                    let mut off_bytes = [0u8; 8];
287                    off_bytes[..6].copy_from_slice(&entry[4..10]);
288                    let posting_offset = u64::from_le_bytes(off_bytes);
289
290                    let posting_length = entry
291                        .get(10..14)
292                        .and_then(|s| s.try_into().ok())
293                        .map(u32::from_le_bytes)?;
294
295                    let doc_frequency = entry
296                        .get(14..18)
297                        .and_then(|s| s.try_into().ok())
298                        .map(u32::from_le_bytes)?;
299
300                    return Some(TrigramInfo {
301                        posting_offset,
302                        posting_length,
303                        doc_frequency,
304                    });
305                }
306                std::cmp::Ordering::Less => low = mid + 1,
307                std::cmp::Ordering::Greater => high = mid,
308            }
309        }
310
311        None
312    }
313
314    fn get_trigram_cdx(&self, trigram: Trigram) -> Option<TrigramInfo> {
315        let mut block_idx = 0;
316        for (i, entry) in self.cdx_blocks.iter().enumerate() {
317            if entry.first_key > trigram {
318                break;
319            }
320            block_idx = i;
321        }
322
323        let block_entry = self.cdx_blocks.get(block_idx)?;
324
325        let block_end = self.cdx_blocks.get(block_idx + 1).map_or_else(
326            || self.header.trigram_table_offset + self.header.trigram_table_size,
327            |next| next.block_offset,
328        );
329
330        let block_start = block_entry.block_offset as usize;
331        let block_end = block_end as usize;
332        let block_data = self.mmap.get(block_start..block_end)?;
333
334        let decompressed = match zstd::decode_all(block_data) {
335            Ok(d) => d,
336            Err(e) => {
337                tracing::warn!("ix: CDX block decompression failed: {e}");
338                return None;
339            }
340        };
341
342        let mut pos = 0;
343        let num_entries = match crate::varint::decode(&decompressed, &mut pos) {
344            Ok(v) => usize::try_from(v).unwrap_or(0),
345            Err(e) => {
346                tracing::warn!("ix: CDX num_entries varint decode failed: {e}");
347                return None;
348            }
349        };
350
351        let mut last_key = 0u32;
352        for _ in 0..num_entries {
353            let key_delta = match crate::varint::decode(&decompressed, &mut pos) {
354                Ok(v) => u32::try_from(v).unwrap_or(0),
355                Err(e) => {
356                    tracing::warn!("ix: CDX key_delta varint decode failed: {e}");
357                    return None;
358                }
359            };
360            let key = last_key + key_delta;
361            last_key = key;
362
363            let posting_offset = match crate::varint::decode(&decompressed, &mut pos) { Ok(v) => v, Err(e) => { tracing::warn!("ix: CDX posting_offset varint decode failed: {e}"); return None; } };
364            let posting_length = match crate::varint::decode(&decompressed, &mut pos) {
365                Ok(v) => u32::try_from(v).unwrap_or(0),
366                Err(e) => {
367                    tracing::warn!("ix: CDX posting_length varint decode failed: {e}");
368                    return None;
369                }
370            };
371            let doc_frequency = match crate::varint::decode(&decompressed, &mut pos) {
372                Ok(v) => u32::try_from(v).unwrap_or(0),
373                Err(e) => {
374                    tracing::warn!("ix: CDX doc_frequency varint decode failed: {e}");
375                    return None;
376                }
377            };
378
379            if key == trigram {
380                return Some(TrigramInfo {
381                    posting_offset,
382                    posting_length,
383                    doc_frequency,
384                });
385            }
386            if key > trigram {
387                break;
388            }
389        }
390
391        None
392    }
393
394    /// Decode the posting list for a given trigram info.
395    ///
396    /// # Errors
397    ///
398    /// Returns an error if the posting data is out of bounds or corrupted.
399    pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
400        let start = info.posting_offset as usize;
401        let end = start + info.posting_length as usize;
402        if end > self.mmap.len() {
403            return Err(Error::PostingOutOfBounds);
404        }
405        PostingList::decode(&self.mmap[start..end])
406    }
407
408    /// Retrieve file metadata by its ID.
409    ///
410    /// # Errors
411    ///
412    /// Returns an error if the file ID is out of bounds or the file table entry
413    /// is malformed.
414    pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
415        if file_id >= self.header.file_count {
416            return Err(Error::FileIdOutOfBounds(file_id));
417        }
418
419        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
420        let entry = self
421            .mmap
422            .get(entry_off..entry_off + FILE_ENTRY_SIZE)
423            .ok_or(Error::SectionOutOfBounds {
424                section: "file_entry",
425                offset: entry_off as u64,
426                size: FILE_ENTRY_SIZE as u64,
427                file_len: self.mmap.len() as u64,
428            })?;
429
430        let path_off = u32::from_le_bytes(
431            entry[4..8]
432                .try_into()
433                .map_err(|_| Error::Config("invalid path offset".into()))?,
434        );
435        let status = FileStatus::from_u8(entry[10]);
436        let mtime_ns = u64::from_le_bytes(
437            entry[12..20]
438                .try_into()
439                .map_err(|_| Error::Config("invalid mtime".into()))?,
440        );
441        let size_bytes = u64::from_le_bytes(
442            entry[20..28]
443                .try_into()
444                .map_err(|_| Error::Config("invalid size".into()))?,
445        );
446        let content_hash = u64::from_le_bytes(
447            entry[28..36]
448                .try_into()
449                .map_err(|_| Error::Config("invalid hash".into()))?,
450        );
451
452        let path = self.string_pool.resolve(path_off)?;
453
454        Ok(FileInfo {
455            file_id,
456            path: PathBuf::from(path),
457            status,
458            mtime_ns,
459            size_bytes,
460            content_hash,
461        })
462    }
463
464    /// Check if a bloom filter for a file may contain a trigram.
465    ///
466    /// Returns `true` if the trigram may be present (conservative) or if
467    /// any error occurs reading the bloom data (safe default: assume present).
468    #[must_use]
469    pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
470        if !self.header.has_bloom() {
471            return true;
472        }
473
474        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
475        let Some(bloom_bytes) = self.mmap.get(entry_off + 40..entry_off + 44) else {
476            return true;
477        };
478
479        let bloom_rel_off = match bloom_bytes.try_into() {
480            Ok(b) => u32::from_le_bytes(b),
481            Err(_) => return true,
482        };
483        let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;
484
485        let Some(size_bytes) = self.mmap.get(bloom_abs_off..bloom_abs_off + 2) else {
486            return true;
487        };
488        let size = match size_bytes.try_into() {
489            Ok(b) => u16::from_le_bytes(b),
490            Err(_) => return true,
491        } as usize;
492
493        let num_hashes = self.mmap.get(bloom_abs_off + 2).copied().unwrap_or(0);
494        let Some(bits) = self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) else {
495            return true;
496        };
497
498        BloomFilter::slice_contains(bits, num_hashes, trigram)
499    }
500
501    /// Retrieve high-level shard metadata without parsing the full header.
502    #[must_use]
503    pub const fn metadata(&self) -> ShardMetadata {
504        ShardMetadata {
505            shard_timestamp: self.header.created_at,
506            file_count: self.header.file_count,
507            trigram_count: self.header.trigram_count,
508        }
509    }
510
511    /// Detect whether the shard file on disk has been rebuilt under this live mmap.
512    ///
513    /// Returns `true` if the inode or file size differs, or if the file no longer exists.
514    /// A stale reader should be dropped and reopened.
515    ///
516    /// On Unix: uses inode comparison (inode changes on atomic rename).
517    /// On non-Unix: uses file size comparison only (Windows file locking prevents
518    /// rebuild under live mmap, so size-only detection is sufficient).
519    #[must_use]
520    pub fn is_stale(&self, path: &Path) -> bool {
521        let Ok(current) = std::fs::metadata(path) else {
522            return true;
523        };
524
525        if current.len() as usize != self.mmap.len() {
526            return true;
527        }
528
529        #[cfg(unix)]
530        {
531            if let Some(stored_inode) = self.inode
532                && current.ino() != stored_inode
533            {
534                return true;
535            }
536        }
537
538        false
539    }
540}