1use crate::bloom::BloomFilter;
6use crate::error::{Error, Result};
7use crate::format::{Header, FileStatus, HEADER_SIZE, TRIGRAM_ENTRY_SIZE, FILE_ENTRY_SIZE};
8use crate::posting::PostingList;
9use crate::string_pool::StringPoolReader;
10use crate::trigram::Trigram;
11use memmap2::Mmap;
12use std::fs::File;
13use std::path::{Path, PathBuf};
14use std::time::UNIX_EPOCH;
15
16#[cfg(unix)]
17use std::os::unix::fs::MetadataExt;
18
/// Summary counters for one index shard, copied out of its header.
///
/// Produced by `Reader::metadata`. The type is `Copy` and comparable, so
/// callers can snapshot it cheaply and detect changes with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ShardMetadata {
    /// The header's `created_at` value.
    // NOTE(review): the unit (seconds, micros, ...) is defined by the index
    // format and is not visible here — confirm before comparing against clocks.
    pub shard_timestamp: u64,
    /// The header's `file_count`: number of entries in the file table.
    pub file_count: u32,
    /// The header's `trigram_count`: number of entries in the trigram table.
    pub trigram_count: u32,
}
29
30pub struct Reader {
32 mmap: Mmap,
33 pub header: Header,
35 string_pool: StringPoolReader<'static>,
36 inode: Option<u64>,
37}
38
/// Location and statistics of one trigram's posting list, as read from the
/// shard's trigram table by `Reader::get_trigram`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TrigramInfo {
    /// Byte offset of the encoded posting list within the mapped shard file.
    /// Stored on disk as a 48-bit little-endian value.
    pub posting_offset: u64,
    /// Length in bytes of the encoded posting list.
    pub posting_length: u32,
    /// Document-frequency value stored in the table entry.
    // NOTE(review): presumably the number of files containing the trigram —
    // confirm against the index writer.
    pub doc_frequency: u32,
}
49
50#[derive(Debug)]
52pub struct FileInfo {
53 pub file_id: u32,
55 pub path: PathBuf,
57 pub status: FileStatus,
59 pub mtime_ns: u64,
61 pub size_bytes: u64,
63 pub content_hash: u64,
65}
66
67#[allow(clippy::as_conversions)] #[allow(clippy::indexing_slicing)] impl Reader {
70 pub fn open(path: &Path) -> Result<Self> {
77 let file = File::open(path)?;
78
79 let mmap = unsafe { Mmap::map(&file)? };
83
84 if mmap.len() < HEADER_SIZE {
85 return Err(Error::IndexTooSmall);
86 }
87
88 let header = Header::parse(&mmap[0..HEADER_SIZE])?;
89 header.validate_bounds(mmap.len() as u64)?;
90
91 #[cfg(unix)]
92 let inode = Some(file.metadata()?.ino());
93
94 #[cfg(not(unix))]
95 let inode = None;
96
97 let string_pool_data: &'static [u8] = unsafe {
104 let start = header.string_pool_offset as usize;
105 let end = (header.string_pool_offset + header.string_pool_size) as usize;
106 std::mem::transmute::<&[u8], &'static [u8]>(&mmap[start..end])
107 };
108 let string_pool = StringPoolReader::new(string_pool_data)?;
109
110 Ok(Self {
111 mmap,
112 header,
113 string_pool,
114 inode,
115 })
116 }
117
118 pub fn get_last_modified(root: &Path) -> Result<u64> {
124 let mut last_modified = 0u64;
125 let walker = ignore::WalkBuilder::new(root)
126 .hidden(false)
127 .git_ignore(true)
128 .require_git(false)
129 .add_custom_ignore_filename(".ixignore")
130 .filter_entry(move |entry| {
131 let path = entry.path();
132 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
133
134 if entry.file_type().is_some_and(|t| t.is_dir())
135 && matches!(
136 name,
137 "lost+found"
138 | ".git"
139 | "node_modules"
140 | "target"
141 | "__pycache__"
142 | ".tox"
143 | ".venv"
144 | "venv"
145 | ".ix"
146 )
147 {
148 return false;
149 }
150
151 if entry.file_type().is_some_and(|t| t.is_file()) {
152 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
153 if matches!(
154 ext,
155 "so" | "o"
156 | "dylib"
157 | "a"
158 | "dll"
159 | "exe"
160 | "pyc"
161 | "jpg"
162 | "png"
163 | "gif"
164 | "mp4"
165 | "mp3"
166 | "pdf"
167 | "zip"
168 | "7z"
169 | "rar"
170 | "sqlite"
171 | "db"
172 | "bin"
173 ) || name.ends_with(".tar.gz")
174 {
175 return false;
176 }
177 }
178 true
179 })
180 .build();
181
182 for result in walker {
183 match result {
184 Ok(entry) => {
185 if entry.file_type().is_some_and(|t| t.is_file()) {
186 let metadata =
187 entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
188 let mtime = metadata
189 .modified()
190 .and_then(|t| {
191 t.duration_since(UNIX_EPOCH)
192 .map_err(|_| std::io::Error::other("time went backwards"))
193 })
194 .map_or(0, |d| d.as_micros() as u64);
195 if mtime > last_modified {
196 last_modified = mtime;
197 }
198 }
199 }
200 Err(e) => {
201 eprintln!("ix: warning: stale check skipping path: {e}");
202 }
203 }
204 }
205 Ok(last_modified)
206 }
207
208 pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
211 let count = self.header.trigram_count as usize;
212 let table_start = self.header.trigram_table_offset as usize;
213
214 let mut low = 0;
215 let mut high = count;
216
217 while low < high {
218 let mid = low + (high - low) / 2;
219 let entry_off = table_start + mid * TRIGRAM_ENTRY_SIZE;
220
221 let key_bytes = self.mmap.get(entry_off..entry_off + 4)?;
222 let key = u32::from_le_bytes(key_bytes.try_into().ok()?);
223
224 match key.cmp(&trigram) {
225 std::cmp::Ordering::Equal => {
226 let entry = self.mmap.get(entry_off..entry_off + TRIGRAM_ENTRY_SIZE)?;
227
228 let mut off_bytes = [0u8; 8];
229 off_bytes[..6].copy_from_slice(&entry[4..10]);
230 let posting_offset = u64::from_le_bytes(off_bytes);
231
232 let posting_length = entry
233 .get(10..14)
234 .and_then(|s| s.try_into().ok())
235 .map(u32::from_le_bytes)?;
236
237 let doc_frequency = entry
238 .get(14..18)
239 .and_then(|s| s.try_into().ok())
240 .map(u32::from_le_bytes)?;
241
242 return Some(TrigramInfo {
243 posting_offset,
244 posting_length,
245 doc_frequency,
246 });
247 }
248 std::cmp::Ordering::Less => low = mid + 1,
249 std::cmp::Ordering::Greater => high = mid,
250 }
251 }
252
253 None
254 }
255
256 pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
262 let start = info.posting_offset as usize;
263 let end = start + info.posting_length as usize;
264 if end > self.mmap.len() {
265 return Err(Error::PostingOutOfBounds);
266 }
267 PostingList::decode(&self.mmap[start..end])
268 }
269
270 pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
277 if file_id >= self.header.file_count {
278 return Err(Error::FileIdOutOfBounds(file_id));
279 }
280
281 let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
282 let entry = self
283 .mmap
284 .get(entry_off..entry_off + FILE_ENTRY_SIZE)
285 .ok_or(Error::SectionOutOfBounds {
286 section: "file_entry",
287 offset: entry_off as u64,
288 size: FILE_ENTRY_SIZE as u64,
289 file_len: self.mmap.len() as u64,
290 })?;
291
292 let path_off = u32::from_le_bytes(
293 entry[4..8]
294 .try_into()
295 .map_err(|_| Error::Config("invalid path offset".into()))?,
296 );
297 let status = FileStatus::from_u8(entry[10]);
298 let mtime_ns = u64::from_le_bytes(
299 entry[12..20]
300 .try_into()
301 .map_err(|_| Error::Config("invalid mtime".into()))?,
302 );
303 let size_bytes = u64::from_le_bytes(
304 entry[20..28]
305 .try_into()
306 .map_err(|_| Error::Config("invalid size".into()))?,
307 );
308 let content_hash = u64::from_le_bytes(
309 entry[28..36]
310 .try_into()
311 .map_err(|_| Error::Config("invalid hash".into()))?,
312 );
313
314 let path = self.string_pool.resolve(path_off)?;
315
316 Ok(FileInfo {
317 file_id,
318 path: PathBuf::from(path),
319 status,
320 mtime_ns,
321 size_bytes,
322 content_hash,
323 })
324 }
325
326 #[must_use]
333 pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
334 if !self.header.has_bloom() {
335 return true;
336 }
337
338 let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
339 let Some(bloom_bytes) = self.mmap.get(entry_off + 40..entry_off + 44) else {
340 return true;
341 };
342
343 let bloom_rel_off = u32::from_le_bytes(
344 bloom_bytes
345 .try_into()
346 .expect("bloom_bytes is exactly 4 bytes"),
347 );
348 let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;
349
350 let Some(size_bytes) = self.mmap.get(bloom_abs_off..bloom_abs_off + 2) else {
351 return true;
352 };
353 let size = u16::from_le_bytes(
354 size_bytes
355 .try_into()
356 .expect("size_bytes is exactly 2 bytes"),
357 ) as usize;
358
359 let num_hashes = self.mmap.get(bloom_abs_off + 2).copied().unwrap_or(0);
360 let Some(bits) = self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) else {
361 return true;
362 };
363
364 BloomFilter::slice_contains(bits, num_hashes, trigram)
365 }
366
367 #[must_use]
369 pub const fn metadata(&self) -> ShardMetadata {
370 ShardMetadata {
371 shard_timestamp: self.header.created_at,
372 file_count: self.header.file_count,
373 trigram_count: self.header.trigram_count,
374 }
375 }
376
377 #[must_use]
386 pub fn is_stale(&self, path: &Path) -> bool {
387 let Ok(current) = std::fs::metadata(path) else {
388 return true;
389 };
390
391 if current.len() as usize != self.mmap.len() {
392 return true;
393 }
394
395 #[cfg(unix)]
396 {
397 if let Some(stored_inode) = self.inode
398 && current.ino() != stored_inode
399 {
400 return true;
401 }
402 }
403
404 false
405 }
406}