Skip to main content

ix/
reader.rs

1//! Index reader — the mmap-based query-time interface.
2//!
3//! Fast, zero-copy access to the index data.
4
5use crate::bloom::BloomFilter;
6use crate::error::{Error, Result};
7use crate::format::*;
8use crate::posting::PostingList;
9use crate::string_pool::StringPoolReader;
10use crate::trigram::Trigram;
11use memmap2::Mmap;
12use std::fs::File;
13use std::path::{Path, PathBuf};
14use std::time::UNIX_EPOCH;
15
16#[cfg(unix)]
17use std::os::unix::fs::MetadataExt;
18
/// Summary counters for one index shard, copied out of its header.
///
/// Produced by [`Reader::metadata`]; every field is a plain copy of the
/// corresponding header field.
#[derive(Clone, Copy, Debug)]
pub struct ShardMetadata {
    /// The header's `created_at` value.
    pub shard_timestamp: u64,
    /// Number of entries in the shard's file table.
    pub file_count: u32,
    /// Number of entries in the shard's trigram table.
    pub trigram_count: u32,
}
25
26pub struct Reader {
27    mmap: Mmap,
28    pub header: Header,
29    string_pool: StringPoolReader<'static>,
30    inode: Option<u64>,
31}
32
/// Location and statistics of one trigram's posting list, as stored in the
/// shard's trigram table.
///
/// Returned by [`Reader::get_trigram`] and consumed by
/// [`Reader::decode_postings`].
#[derive(Debug)]
pub struct TrigramInfo {
    /// Byte offset of the encoded posting list within the shard file
    /// (stored on disk as a 6-byte little-endian integer).
    pub posting_offset: u64,
    /// Length in bytes of the encoded posting list.
    pub posting_length: u32,
    /// Document-frequency value recorded for the trigram in its table entry.
    pub doc_frequency: u32,
}
39
40#[derive(Debug)]
41pub struct FileInfo {
42    pub file_id: u32,
43    pub path: PathBuf,
44    pub status: FileStatus,
45    pub mtime_ns: u64,
46    pub size_bytes: u64,
47    pub content_hash: u64,
48}
49
50impl Reader {
51    pub fn open(path: &Path) -> Result<Self> {
52        let file = File::open(path)?;
53
54        // SAFETY: Mmap::map wraps the mmap(2) syscall. The file handle is kept alive
55        // by Mmap's internal Arc<File>, ensuring the underlying data remains valid
56        // for the lifetime of the mmap.
57        let mmap = unsafe { Mmap::map(&file)? };
58
59        if mmap.len() < HEADER_SIZE {
60            return Err(Error::IndexTooSmall);
61        }
62
63        let header = Header::parse(&mmap[0..HEADER_SIZE])?;
64        header.validate_bounds(mmap.len() as u64)?;
65
66        #[cfg(unix)]
67        let inode = Some(file.metadata()?.ino());
68
69        #[cfg(not(unix))]
70        let inode = None;
71
72        // SAFETY: We transmute the slice lifetime to 'static. This is sound because:
73        // INVARIANT: Reader owns the Mmap, which owns the underlying memory.
74        // INVARIANT: Mmap's data remains valid for the entire lifetime of Reader.
75        // INVARIANT: No mutable access to mmap occurs after construction.
76        // INVARIANT: StringPoolReader<'static> cannot outlive Reader (it's a field).
77        // This is the standard pattern for self-referential mmap structs in Rust.
78        let string_pool_data: &'static [u8] = unsafe {
79            let start = header.string_pool_offset as usize;
80            let end = (header.string_pool_offset + header.string_pool_size) as usize;
81            std::mem::transmute::<&[u8], &'static [u8]>(&mmap[start..end])
82        };
83        let string_pool = StringPoolReader::new(string_pool_data)?;
84
85        Ok(Self {
86            mmap,
87            header,
88            string_pool,
89            inode,
90        })
91    }
92
93    pub fn get_last_modified(root: &Path) -> Result<u64> {
94        let mut last_modified = 0u64;
95        let walker = ignore::WalkBuilder::new(root)
96            .hidden(false)
97            .git_ignore(true)
98            .require_git(false)
99            .add_custom_ignore_filename(".ixignore")
100            .filter_entry(move |entry| {
101                let path = entry.path();
102                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
103
104                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
105                    && matches!(
106                        name,
107                        "lost+found"
108                            | ".git"
109                            | "node_modules"
110                            | "target"
111                            | "__pycache__"
112                            | ".tox"
113                            | ".venv"
114                            | "venv"
115                            | ".ix"
116                    )
117                {
118                    return false;
119                }
120
121                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
122                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
123                    if matches!(
124                        ext,
125                        "so" | "o"
126                            | "dylib"
127                            | "a"
128                            | "dll"
129                            | "exe"
130                            | "pyc"
131                            | "jpg"
132                            | "png"
133                            | "gif"
134                            | "mp4"
135                            | "mp3"
136                            | "pdf"
137                            | "zip"
138                            | "7z"
139                            | "rar"
140                            | "sqlite"
141                            | "db"
142                            | "bin"
143                    ) || name.ends_with(".tar.gz")
144                    {
145                        return false;
146                    }
147                }
148                true
149            })
150            .build();
151
152        for result in walker {
153            match result {
154                Ok(entry) => {
155                    if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
156                        let metadata =
157                            entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
158                        let mtime = metadata
159                            .modified()
160                            .and_then(|t| {
161                                t.duration_since(UNIX_EPOCH)
162                                    .map_err(|_| std::io::Error::other("time went backwards"))
163                            })
164                            .map(|d| d.as_micros() as u64)
165                            .unwrap_or(0);
166                        if mtime > last_modified {
167                            last_modified = mtime;
168                        }
169                    }
170                }
171                Err(e) => {
172                    eprintln!("ix: warning: stale check skipping path: {}", e);
173                }
174            }
175        }
176        Ok(last_modified)
177    }
178
179    pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
180        let count = self.header.trigram_count as usize;
181        let table_start = self.header.trigram_table_offset as usize;
182
183        let mut low = 0;
184        let mut high = count;
185
186        while low < high {
187            let mid = low + (high - low) / 2;
188            let entry_off = table_start + mid * TRIGRAM_ENTRY_SIZE;
189
190            let key_bytes = self.mmap.get(entry_off..entry_off + 4)?;
191            let key = u32::from_le_bytes(key_bytes.try_into().ok()?);
192
193            if key == trigram {
194                let entry = self.mmap.get(entry_off..entry_off + TRIGRAM_ENTRY_SIZE)?;
195
196                let mut off_bytes = [0u8; 8];
197                off_bytes[..6].copy_from_slice(&entry[4..10]);
198                let posting_offset = u64::from_le_bytes(off_bytes);
199
200                let posting_length = entry
201                    .get(10..14)
202                    .and_then(|s| s.try_into().ok())
203                    .map(u32::from_le_bytes)?;
204
205                let doc_frequency = entry
206                    .get(14..18)
207                    .and_then(|s| s.try_into().ok())
208                    .map(u32::from_le_bytes)?;
209
210                return Some(TrigramInfo {
211                    posting_offset,
212                    posting_length,
213                    doc_frequency,
214                });
215            } else if key < trigram {
216                low = mid + 1;
217            } else {
218                high = mid;
219            }
220        }
221
222        None
223    }
224
225    pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
226        let start = info.posting_offset as usize;
227        let end = start + info.posting_length as usize;
228        if end > self.mmap.len() {
229            return Err(Error::PostingOutOfBounds);
230        }
231        PostingList::decode(&self.mmap[start..end])
232    }
233
234    pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
235        if file_id >= self.header.file_count {
236            return Err(Error::FileIdOutOfBounds(file_id));
237        }
238
239        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
240        let entry = self
241            .mmap
242            .get(entry_off..entry_off + FILE_ENTRY_SIZE)
243            .ok_or(Error::SectionOutOfBounds {
244                section: "file_entry",
245                offset: entry_off as u64,
246                size: FILE_ENTRY_SIZE as u64,
247                file_len: self.mmap.len() as u64,
248            })?;
249
250        let path_off = u32::from_le_bytes(
251            entry[4..8]
252                .try_into()
253                .map_err(|_| Error::Config("invalid path offset".into()))?,
254        );
255        let status = FileStatus::from_u8(entry[10]);
256        let mtime_ns = u64::from_le_bytes(
257            entry[12..20]
258                .try_into()
259                .map_err(|_| Error::Config("invalid mtime".into()))?,
260        );
261        let size_bytes = u64::from_le_bytes(
262            entry[20..28]
263                .try_into()
264                .map_err(|_| Error::Config("invalid size".into()))?,
265        );
266        let content_hash = u64::from_le_bytes(
267            entry[28..36]
268                .try_into()
269                .map_err(|_| Error::Config("invalid hash".into()))?,
270        );
271
272        let path = self.string_pool.resolve(path_off)?;
273
274        Ok(FileInfo {
275            file_id,
276            path: PathBuf::from(path),
277            status,
278            mtime_ns,
279            size_bytes,
280            content_hash,
281        })
282    }
283
284    pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
285        if !self.header.has_bloom() {
286            return true;
287        }
288
289        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
290        let Some(bloom_bytes) = self.mmap.get(entry_off + 40..entry_off + 44) else {
291            return true;
292        };
293
294        let bloom_rel_off = u32::from_le_bytes(
295            bloom_bytes
296                .try_into()
297                .expect("bloom_bytes is exactly 4 bytes"),
298        );
299        let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;
300
301        let Some(size_bytes) = self.mmap.get(bloom_abs_off..bloom_abs_off + 2) else {
302            return true;
303        };
304        let size = u16::from_le_bytes(
305            size_bytes
306                .try_into()
307                .expect("size_bytes is exactly 2 bytes"),
308        ) as usize;
309
310        let num_hashes = self.mmap.get(bloom_abs_off + 2).copied().unwrap_or(0);
311        let Some(bits) = self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) else {
312            return true;
313        };
314
315        BloomFilter::slice_contains(bits, num_hashes, trigram)
316    }
317
318    pub fn metadata(&self) -> ShardMetadata {
319        ShardMetadata {
320            shard_timestamp: self.header.created_at,
321            file_count: self.header.file_count,
322            trigram_count: self.header.trigram_count,
323        }
324    }
325
326    /// Detect whether the shard file on disk has been rebuilt under this live mmap.
327    ///
328    /// Returns `true` if the inode or file size differs, or if the file no longer exists.
329    /// A stale reader should be dropped and reopened.
330    ///
331    /// On Unix: uses inode comparison (inode changes on atomic rename).
332    /// On non-Unix: uses file size comparison only (Windows file locking prevents
333    /// rebuild under live mmap, so size-only detection is sufficient).
334    pub fn is_stale(&self, path: &Path) -> bool {
335        let current = match std::fs::metadata(path) {
336            Ok(m) => m,
337            Err(_) => return true,
338        };
339
340        if current.len() as usize != self.mmap.len() {
341            return true;
342        }
343
344        #[cfg(unix)]
345        {
346            if let Some(stored_inode) = self.inode
347                && current.ino() != stored_inode
348            {
349                return true;
350            }
351        }
352
353        false
354    }
355}