Skip to main content

ix/
reader.rs

1//! Index reader — the mmap-based query-time interface.
2//!
3//! Fast, zero-copy access to the index data.
4
5use crate::bloom::BloomFilter;
6use crate::error::{Error, Result};
7use crate::format::*;
8use crate::posting::PostingList;
9use crate::string_pool::StringPoolReader;
10use crate::trigram::Trigram;
11use memmap2::Mmap;
12use std::fs::File;
13use std::path::{Path, PathBuf};
14use std::time::UNIX_EPOCH;
15
16pub struct Reader {
17    mmap: Mmap,
18    pub header: Header,
19    string_pool: StringPoolReader<'static>,
20}
21
/// Location and statistics of one trigram's posting list, as read from a
/// trigram-table entry by `Reader::get_trigram`.
///
/// All fields are plain integers, so the type is cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TrigramInfo {
    /// Byte offset of the posting list inside the mapped index file
    /// (stored on disk as a 48-bit little-endian value).
    pub posting_offset: u64,
    /// Length of the encoded posting list in bytes.
    pub posting_length: u32,
    /// Presumably the number of files containing the trigram — confirm
    /// against the index writer.
    pub doc_frequency: u32,
}
28
/// Metadata for a single indexed file, decoded from one file-table entry by
/// `Reader::get_file`.
#[derive(Debug)]
pub struct FileInfo {
    /// Index of the entry in the file table (the id passed to `get_file`).
    pub file_id: u32,
    /// Path of the file, resolved from the index's string pool.
    pub path: PathBuf,
    /// Status decoded from the entry's status byte via `FileStatus::from_u8`.
    pub status: FileStatus,
    /// Modification time stored in the entry; nanoseconds judging by the
    /// field name — TODO confirm unit and epoch against the index writer.
    pub mtime_ns: u64,
    /// File size stored in the entry; bytes judging by the field name.
    pub size_bytes: u64,
    /// 64-bit content hash stored in the entry (hash function not visible here).
    pub content_hash: u64,
}
38
39impl Reader {
40    pub fn open(path: &Path) -> Result<Self> {
41        let file = File::open(path)?;
42        let mmap = unsafe { Mmap::map(&file)? };
43
44        if mmap.len() < HEADER_SIZE {
45            return Err(Error::IndexTooSmall);
46        }
47
48        let header = Header::parse(&mmap[0..HEADER_SIZE])?;
49        header.validate_bounds(mmap.len() as u64)?;
50
51        // Safety: we are extending the lifetime of the slice to 'static.
52        // This is okay because 'Reader' owns the 'Mmap' which owns the data.
53        let string_pool_data: &'static [u8] = unsafe {
54            let slice = &mmap[header.string_pool_offset as usize
55                ..(header.string_pool_offset + header.string_pool_size) as usize];
56            std::mem::transmute(slice)
57        };
58        let string_pool = StringPoolReader::new(string_pool_data)?;
59
60        Ok(Self {
61            mmap,
62            header,
63            string_pool,
64        })
65    }
66
67    pub fn get_last_modified(root: &Path) -> Result<u64> {
68        let mut last_modified = 0u64;
69        let walker = ignore::WalkBuilder::new(root)
70            .hidden(false)
71            .git_ignore(true)
72            .require_git(false)
73            .add_custom_ignore_filename(".ixignore")
74            .filter_entry(move |entry| {
75                let path = entry.path();
76                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
77                
78                // Built-in directory defaults
79                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
80                    && (name == "lost+found" || name == ".git" || name == "node_modules" || 
81                       name == "target" || name == "__pycache__" || name == ".tox" || 
82                       name == ".venv" || name == "venv" || name == ".ix") 
83                {
84                    return false;
85                }
86
87                // Built-in file extension defaults
88                if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
89                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
90                    match ext {
91                        // Binary extensions
92                        "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
93                        // Media
94                        "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
95                        // Archives
96                        "zip" | "7z" | "rar" |
97                        // Data
98                        "sqlite" | "db" | "bin" => return false,
99                        _ => {}
100                    }
101                    if name.ends_with(".tar.gz") {
102                        return false;
103                    }
104                }
105                true
106            })
107            .build();
108
109        for result in walker {
110            match result {
111                Ok(entry) => {
112                    if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
113                        let metadata = entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
114                        let mtime = metadata
115                            .modified()
116                            .and_then(|t| {
117                                t.duration_since(UNIX_EPOCH)
118                                    .map_err(|_| std::io::Error::other("time went backwards"))
119                            })
120                            .map(|d| d.as_micros() as u64)
121                            .unwrap_or(0);
122                        if mtime > last_modified {
123                            last_modified = mtime;
124                        }
125                    }
126                }
127                Err(e) => {
128                    eprintln!("ix: warning: stale check skipping path: {}", e);
129                }
130            }
131        }
132        Ok(last_modified)
133    }
134
135    pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
136        let count = self.header.trigram_count as usize;
137        let table_start = self.header.trigram_table_offset as usize;
138
139        let mut low = 0;
140        let mut high = count;
141
142        while low < high {
143            let mid = low + (high - low) / 2;
144            let entry_off = table_start + mid * TRIGRAM_ENTRY_SIZE;
145
146            // Read trigram key (first 4 bytes)
147            let key_bytes = self.mmap.get(entry_off..entry_off + 4)?;
148            let key = u32::from_le_bytes(key_bytes.try_into().ok()?);
149
150            if key == trigram {
151                let entry = self.mmap.get(entry_off..entry_off + TRIGRAM_ENTRY_SIZE)?;
152
153                // Read posting_offset (u48, bytes 4..10)
154                let mut off_bytes = [0u8; 8];
155                off_bytes[..6].copy_from_slice(&entry[4..10]);
156                let posting_offset = u64::from_le_bytes(off_bytes);
157
158                let posting_length = entry
159                    .get(10..14)
160                    .and_then(|s| s.try_into().ok())
161                    .map(u32::from_le_bytes)
162                    .unwrap_or(0);
163                let doc_frequency = entry
164                    .get(14..18)
165                    .and_then(|s| s.try_into().ok())
166                    .map(u32::from_le_bytes)
167                    .unwrap_or(0);
168
169                return Some(TrigramInfo {
170                    posting_offset,
171                    posting_length,
172                    doc_frequency,
173                });
174            } else if key < trigram {
175                low = mid + 1;
176            } else {
177                high = mid;
178            }
179        }
180
181        None
182    }
183
184    pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
185        let start = info.posting_offset as usize;
186        let end = start + info.posting_length as usize;
187        if end > self.mmap.len() {
188            return Err(Error::PostingOutOfBounds);
189        }
190        PostingList::decode(&self.mmap[start..end])
191    }
192
193    pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
194        if file_id >= self.header.file_count {
195            return Err(Error::FileIdOutOfBounds(file_id));
196        }
197
198        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
199        let entry = self
200            .mmap
201            .get(entry_off..entry_off + FILE_ENTRY_SIZE)
202            .ok_or(Error::SectionOutOfBounds {
203                section: "file_entry",
204                offset: entry_off as u64,
205                size: FILE_ENTRY_SIZE as u64,
206                file_len: self.mmap.len() as u64,
207            })?;
208
209        let path_off = u32::from_le_bytes(
210            entry[4..8]
211                .try_into()
212                .map_err(|_| Error::Config("invalid path offset".into()))?,
213        );
214        let status = FileStatus::from_u8(entry[10]);
215        let mtime_ns = u64::from_le_bytes(
216            entry[12..20]
217                .try_into()
218                .map_err(|_| Error::Config("invalid mtime".into()))?,
219        );
220        let size_bytes = u64::from_le_bytes(
221            entry[20..28]
222                .try_into()
223                .map_err(|_| Error::Config("invalid size".into()))?,
224        );
225        let content_hash = u64::from_le_bytes(
226            entry[28..36]
227                .try_into()
228                .map_err(|_| Error::Config("invalid hash".into()))?,
229        );
230
231        let path = self.string_pool.resolve(path_off)?;
232
233        Ok(FileInfo {
234            file_id,
235            path: PathBuf::from(path),
236            status,
237            mtime_ns,
238            size_bytes,
239            content_hash,
240        })
241    }
242
243    pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
244        if !self.header.has_bloom() {
245            return true;
246        }
247
248        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
249        let bloom_rel_off_bytes = self.mmap.get(entry_off + 40..entry_off + 44);
250        if bloom_rel_off_bytes.is_none() {
251            return true;
252        }
253
254        let bloom_rel_off = bloom_rel_off_bytes
255            .and_then(|b| b.try_into().ok())
256            .map(u32::from_le_bytes)
257            .unwrap_or(0);
258
259        let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;
260        if bloom_abs_off + 4 > self.mmap.len() {
261            return true;
262        }
263
264        let size = u16::from_le_bytes(
265            self.mmap[bloom_abs_off..bloom_abs_off + 2]
266                .try_into()
267                .unwrap_or([0u8; 2]),
268        ) as usize;
269        let num_hashes = self.mmap[bloom_abs_off + 2];
270        let bits = match self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) {
271            Some(b) => b,
272            None => return true,
273        };
274
275        BloomFilter::slice_contains(bits, num_hashes, trigram)
276    }
277}