Skip to main content

ix/
reader.rs

1//! Index reader — the mmap-based query-time interface.
2//!
3//! Fast, zero-copy access to the index data.
4
5use crate::bloom::BloomFilter;
6use crate::error::{Error, Result};
7use crate::format::{Header, FileStatus, HEADER_SIZE, TRIGRAM_ENTRY_SIZE, FILE_ENTRY_SIZE};
8use crate::posting::PostingList;
9use crate::string_pool::StringPoolReader;
10use crate::trigram::Trigram;
11use memmap2::Mmap;
12use std::fs::File;
13use std::path::{Path, PathBuf};
14use std::time::UNIX_EPOCH;
15
16#[cfg(unix)]
17use std::os::unix::fs::MetadataExt;
18
/// Lightweight snapshot of shard-level metadata (no mmap needed).
///
/// A plain `Copy` value made of three integers; it holds no borrow of the
/// mapped shard, so it can be kept after the originating [`Reader`] is dropped.
/// [`Reader::metadata`] fills it from the already-parsed header.
#[derive(Debug, Clone, Copy)]
pub struct ShardMetadata {
    /// Microsecond-precision Unix timestamp from the shard header.
    ///
    /// Copied from the header's `created_at` field.
    pub shard_timestamp: u64,
    /// Total number of files indexed in this shard.
    ///
    /// Copied from the header's `file_count` field; valid file IDs are
    /// `0..file_count`.
    pub file_count: u32,
    /// Total number of unique trigrams in this shard.
    ///
    /// Copied from the header's `trigram_count` field, which is also the number
    /// of entries in the trigram table.
    pub trigram_count: u32,
}
29
/// Index reader — mmaps the shard file for zero-copy lookups.
///
/// All lookups slice directly into the mapping; nothing is copied at open time
/// apart from the parsed [`Header`].
pub struct Reader {
    /// Read-only mapping of the whole shard file. Every table and section is
    /// addressed as a byte range inside this mapping.
    mmap: Mmap,
    /// Parsed shard header containing section offsets and sizes.
    pub header: Header,
    /// View over the string-pool section of `mmap`.
    ///
    /// The `'static` lifetime is forged in [`Reader::open`]: the slice really
    /// borrows from `mmap` and is only valid while this `Reader` is alive, so
    /// this field must never be moved out of the struct or exposed with its
    /// `'static` lifetime.
    string_pool: StringPoolReader<'static>,
    /// Inode of the shard file captured at open time on Unix, `None` on other
    /// platforms. [`Reader::is_stale`] compares it with the file currently at
    /// the shard path.
    inode: Option<u64>,
}
38
/// Descriptor pointing into the trigram table for a single trigram.
///
/// Produced by [`Reader::get_trigram`] and consumed by
/// [`Reader::decode_postings`].
#[derive(Debug)]
pub struct TrigramInfo {
    /// Absolute file offset where the posting list begins.
    ///
    /// Decoded from a 6-byte little-endian field of the table entry, so values
    /// produced by the reader always fit in 48 bits.
    pub posting_offset: u64,
    /// Number of bytes in the encoded posting list.
    pub posting_length: u32,
    /// How many files contain this trigram (document frequency).
    pub doc_frequency: u32,
}
49
/// Metadata about a single file known to the index.
///
/// Built by [`Reader::get_file`] from one fixed-size file-table entry plus the
/// path string that entry points to in the string pool.
#[derive(Debug)]
pub struct FileInfo {
    /// Internal 0-based file identifier.
    ///
    /// This is the index of the entry in the file table; it is always less
    /// than the header's `file_count`.
    pub file_id: u32,
    /// Absolute path to the file on disk.
    ///
    /// Resolved from the string pool. NOTE(review): whether stored paths are
    /// always absolute is decided by the index writer — confirm there.
    pub path: PathBuf,
    /// Whether the file is fresh, stale, or deleted.
    ///
    /// Decoded from a single status byte of the table entry.
    pub status: FileStatus,
    /// Last modification time in nanoseconds since the Unix epoch.
    pub mtime_ns: u64,
    /// File size in bytes at index time.
    pub size_bytes: u64,
    /// XXH64 content hash computed at index time.
    pub content_hash: u64,
}
66
67#[allow(clippy::as_conversions)] // binary format: usize/u32/u64 casts for index decoding
68#[allow(clippy::indexing_slicing)] // binary format: fixed-size buffer ops, length-checked
69impl Reader {
70    /// Open and memory-map an index file for reading.
71    ///
72    /// # Errors
73    ///
74    /// Returns an error if the file cannot be opened, memory-mapped, or its
75    /// header is invalid.
76    pub fn open(path: &Path) -> Result<Self> {
77        let file = File::open(path)?;
78
79        // SAFETY: Mmap::map wraps the mmap(2) syscall. The file handle is kept alive
80        // by Mmap's internal Arc<File>, ensuring the underlying data remains valid
81        // for the lifetime of the mmap.
82        let mmap = unsafe { Mmap::map(&file)? };
83
84        if mmap.len() < HEADER_SIZE {
85            return Err(Error::IndexTooSmall);
86        }
87
88        let header = Header::parse(&mmap[0..HEADER_SIZE])?;
89        header.validate_bounds(mmap.len() as u64)?;
90
91        #[cfg(unix)]
92        let inode = Some(file.metadata()?.ino());
93
94        #[cfg(not(unix))]
95        let inode = None;
96
97        // SAFETY: We transmute the slice lifetime to 'static. This is sound because:
98        // INVARIANT: Reader owns the Mmap, which owns the underlying memory.
99        // INVARIANT: Mmap's data remains valid for the entire lifetime of Reader.
100        // INVARIANT: No mutable access to mmap occurs after construction.
101        // INVARIANT: StringPoolReader<'static> cannot outlive Reader (it's a field).
102        // This is the standard pattern for self-referential mmap structs in Rust.
103        let string_pool_data: &'static [u8] = unsafe {
104            let start = header.string_pool_offset as usize;
105            let end = (header.string_pool_offset + header.string_pool_size) as usize;
106            std::mem::transmute::<&[u8], &'static [u8]>(&mmap[start..end])
107        };
108        let string_pool = StringPoolReader::new(string_pool_data)?;
109
110        Ok(Self {
111            mmap,
112            header,
113            string_pool,
114            inode,
115        })
116    }
117
118    /// Get the last modification time among all source files in the tree.
119    ///
120    /// # Errors
121    ///
122    /// Returns an error if the directory walk fails or metadata cannot be read.
123    pub fn get_last_modified(root: &Path) -> Result<u64> {
124        let mut last_modified = 0u64;
125        let walker = ignore::WalkBuilder::new(root)
126            .hidden(false)
127            .git_ignore(true)
128            .require_git(false)
129            .add_custom_ignore_filename(".ixignore")
130            .filter_entry(move |entry| {
131                let path = entry.path();
132                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
133
134                if entry.file_type().is_some_and(|t| t.is_dir())
135                    && matches!(
136                        name,
137                        "lost+found"
138                            | ".git"
139                            | "node_modules"
140                            | "target"
141                            | "__pycache__"
142                            | ".tox"
143                            | ".venv"
144                            | "venv"
145                            | ".ix"
146                    )
147                {
148                    return false;
149                }
150
151                if entry.file_type().is_some_and(|t| t.is_file()) {
152                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
153                    if matches!(
154                        ext,
155                        "so" | "o"
156                            | "dylib"
157                            | "a"
158                            | "dll"
159                            | "exe"
160                            | "pyc"
161                            | "jpg"
162                            | "png"
163                            | "gif"
164                            | "mp4"
165                            | "mp3"
166                            | "pdf"
167                            | "zip"
168                            | "7z"
169                            | "rar"
170                            | "sqlite"
171                            | "db"
172                            | "bin"
173                    ) || name.ends_with(".tar.gz")
174                    {
175                        return false;
176                    }
177                }
178                true
179            })
180            .build();
181
182        for result in walker {
183            match result {
184                Ok(entry) => {
185                    if entry.file_type().is_some_and(|t| t.is_file()) {
186                        let metadata =
187                            entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
188                        let mtime = metadata
189                            .modified()
190                            .and_then(|t| {
191                                t.duration_since(UNIX_EPOCH)
192                                    .map_err(|_| std::io::Error::other("time went backwards"))
193                            })
194                            .map_or(0, |d| d.as_micros() as u64);
195                        if mtime > last_modified {
196                            last_modified = mtime;
197                        }
198                    }
199                }
200                Err(e) => {
201                    eprintln!("ix: warning: stale check skipping path: {e}");
202                }
203            }
204        }
205        Ok(last_modified)
206    }
207
208    /// Binary search the sorted trigram table. Returns `None` if the trigram
209    /// is unknown.
210    pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
211        let count = self.header.trigram_count as usize;
212        let table_start = self.header.trigram_table_offset as usize;
213
214        let mut low = 0;
215        let mut high = count;
216
217        while low < high {
218            let mid = low + (high - low) / 2;
219            let entry_off = table_start + mid * TRIGRAM_ENTRY_SIZE;
220
221            let key_bytes = self.mmap.get(entry_off..entry_off + 4)?;
222            let key = u32::from_le_bytes(key_bytes.try_into().ok()?);
223
224            match key.cmp(&trigram) {
225                std::cmp::Ordering::Equal => {
226                    let entry = self.mmap.get(entry_off..entry_off + TRIGRAM_ENTRY_SIZE)?;
227
228                    let mut off_bytes = [0u8; 8];
229                    off_bytes[..6].copy_from_slice(&entry[4..10]);
230                    let posting_offset = u64::from_le_bytes(off_bytes);
231
232                    let posting_length = entry
233                        .get(10..14)
234                        .and_then(|s| s.try_into().ok())
235                        .map(u32::from_le_bytes)?;
236
237                    let doc_frequency = entry
238                        .get(14..18)
239                        .and_then(|s| s.try_into().ok())
240                        .map(u32::from_le_bytes)?;
241
242                    return Some(TrigramInfo {
243                        posting_offset,
244                        posting_length,
245                        doc_frequency,
246                    });
247                }
248                std::cmp::Ordering::Less => low = mid + 1,
249                std::cmp::Ordering::Greater => high = mid,
250            }
251        }
252
253        None
254    }
255
256    /// Decode the posting list for a given trigram info.
257    ///
258    /// # Errors
259    ///
260    /// Returns an error if the posting data is out of bounds or corrupted.
261    pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
262        let start = info.posting_offset as usize;
263        let end = start + info.posting_length as usize;
264        if end > self.mmap.len() {
265            return Err(Error::PostingOutOfBounds);
266        }
267        PostingList::decode(&self.mmap[start..end])
268    }
269
270    /// Retrieve file metadata by its ID.
271    ///
272    /// # Errors
273    ///
274    /// Returns an error if the file ID is out of bounds or the file table entry
275    /// is malformed.
276    pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
277        if file_id >= self.header.file_count {
278            return Err(Error::FileIdOutOfBounds(file_id));
279        }
280
281        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
282        let entry = self
283            .mmap
284            .get(entry_off..entry_off + FILE_ENTRY_SIZE)
285            .ok_or(Error::SectionOutOfBounds {
286                section: "file_entry",
287                offset: entry_off as u64,
288                size: FILE_ENTRY_SIZE as u64,
289                file_len: self.mmap.len() as u64,
290            })?;
291
292        let path_off = u32::from_le_bytes(
293            entry[4..8]
294                .try_into()
295                .map_err(|_| Error::Config("invalid path offset".into()))?,
296        );
297        let status = FileStatus::from_u8(entry[10]);
298        let mtime_ns = u64::from_le_bytes(
299            entry[12..20]
300                .try_into()
301                .map_err(|_| Error::Config("invalid mtime".into()))?,
302        );
303        let size_bytes = u64::from_le_bytes(
304            entry[20..28]
305                .try_into()
306                .map_err(|_| Error::Config("invalid size".into()))?,
307        );
308        let content_hash = u64::from_le_bytes(
309            entry[28..36]
310                .try_into()
311                .map_err(|_| Error::Config("invalid hash".into()))?,
312        );
313
314        let path = self.string_pool.resolve(path_off)?;
315
316        Ok(FileInfo {
317            file_id,
318            path: PathBuf::from(path),
319            status,
320            mtime_ns,
321            size_bytes,
322            content_hash,
323        })
324    }
325
326    /// Check if a bloom filter for a file may contain a trigram.
327    ///
328    /// # Panics
329    ///
330    /// Panics if the bloom filter bytes in the mmap are not exactly 2 or 4 bytes
331    /// as expected (this should never happen with a valid index file).
332    #[must_use] 
333    pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
334        if !self.header.has_bloom() {
335            return true;
336        }
337
338        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
339        let Some(bloom_bytes) = self.mmap.get(entry_off + 40..entry_off + 44) else {
340            return true;
341        };
342
343        let bloom_rel_off = u32::from_le_bytes(
344            bloom_bytes
345                .try_into()
346                .expect("bloom_bytes is exactly 4 bytes"),
347        );
348        let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;
349
350        let Some(size_bytes) = self.mmap.get(bloom_abs_off..bloom_abs_off + 2) else {
351            return true;
352        };
353        let size = u16::from_le_bytes(
354            size_bytes
355                .try_into()
356                .expect("size_bytes is exactly 2 bytes"),
357        ) as usize;
358
359        let num_hashes = self.mmap.get(bloom_abs_off + 2).copied().unwrap_or(0);
360        let Some(bits) = self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) else {
361            return true;
362        };
363
364        BloomFilter::slice_contains(bits, num_hashes, trigram)
365    }
366
367    /// Retrieve high-level shard metadata without parsing the full header.
368    #[must_use]
369    pub const fn metadata(&self) -> ShardMetadata {
370        ShardMetadata {
371            shard_timestamp: self.header.created_at,
372            file_count: self.header.file_count,
373            trigram_count: self.header.trigram_count,
374        }
375    }
376
377    /// Detect whether the shard file on disk has been rebuilt under this live mmap.
378    ///
379    /// Returns `true` if the inode or file size differs, or if the file no longer exists.
380    /// A stale reader should be dropped and reopened.
381    ///
382    /// On Unix: uses inode comparison (inode changes on atomic rename).
383    /// On non-Unix: uses file size comparison only (Windows file locking prevents
384    /// rebuild under live mmap, so size-only detection is sufficient).
385    #[must_use] 
386    pub fn is_stale(&self, path: &Path) -> bool {
387        let Ok(current) = std::fs::metadata(path) else {
388            return true;
389        };
390
391        if current.len() as usize != self.mmap.len() {
392            return true;
393        }
394
395        #[cfg(unix)]
396        {
397            if let Some(stored_inode) = self.inode
398                && current.ino() != stored_inode
399            {
400                return true;
401            }
402        }
403
404        false
405    }
406}