Skip to main content

ix/
reader.rs

//! Index reader — the mmap-based query-time interface.
//!
//! Fast, zero-copy access to the index data.
4
5use crate::bloom::BloomFilter;
6use crate::error::{Error, Result};
7use crate::format::*;
8use crate::posting::PostingList;
9use crate::string_pool::StringPoolReader;
10use crate::trigram::Trigram;
11use memmap2::Mmap;
12use std::fs::File;
13use std::path::{Path, PathBuf};
14use std::time::UNIX_EPOCH;
15
16pub struct Reader {
17    mmap: Mmap,
18    pub header: Header,
19    string_pool: StringPoolReader<'static>,
20}
21
/// Location and statistics of one trigram's posting list, as decoded from a
/// trigram-table entry.
///
/// The struct is three plain integers, so it is `Copy` and comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TrigramInfo {
    /// Byte offset of the posting list within the index file (stored on disk
    /// as a 48-bit little-endian value).
    pub posting_offset: u64,
    /// Length of the encoded posting list in bytes.
    pub posting_length: u32,
    /// Document frequency recorded for the trigram — presumably the number of
    /// files containing it; confirm against the index writer.
    pub doc_frequency: u32,
}
28
29#[derive(Debug)]
30pub struct FileInfo {
31    pub file_id: u32,
32    pub path: PathBuf,
33    pub status: FileStatus,
34    pub mtime_ns: u64,
35    pub size_bytes: u64,
36    pub content_hash: u64,
37}
38
39impl Reader {
40    pub fn open(path: &Path) -> Result<Self> {
41        let file = File::open(path)?;
42        let mmap = unsafe { Mmap::map(&file)? };
43
44        if mmap.len() < HEADER_SIZE {
45            return Err(Error::IndexTooSmall);
46        }
47
48        let header = Header::parse(&mmap[0..HEADER_SIZE])?;
49        header.validate_bounds(mmap.len() as u64)?;
50
51        // Safety: we are extending the lifetime of the slice to 'static.
52        // This is okay because 'Reader' owns the 'Mmap' which owns the data.
53        let string_pool_data: &'static [u8] = unsafe {
54            let slice = &mmap[header.string_pool_offset as usize
55                ..(header.string_pool_offset + header.string_pool_size) as usize];
56            std::mem::transmute(slice)
57        };
58        let string_pool = StringPoolReader::new(string_pool_data)?;
59
60        Ok(Self {
61            mmap,
62            header,
63            string_pool,
64        })
65    }
66
67    pub fn get_last_modified(root: &Path) -> Result<u64> {
68        let mut last_modified = 0u64;
69        let walker = ignore::WalkBuilder::new(root)
70            .hidden(false)
71            .git_ignore(true)
72            .require_git(false)
73            .add_custom_ignore_filename(".ixignore")
74            .filter_entry(move |entry| {
75                let path = entry.path();
76                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false) {
77                    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
78                    if name == "target" || name == ".git" || name == "node_modules" || name == ".ix"
79                    {
80                        return false;
81                    }
82                }
83                true
84            })
85            .build();
86
87        for result in walker {
88            let entry = result.map_err(|e| Error::Config(e.to_string()))?;
89            if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
90                let metadata = entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
91                let mtime = metadata
92                    .modified()
93                    .and_then(|t| {
94                        t.duration_since(UNIX_EPOCH)
95                            .map_err(|_| std::io::Error::other("time went backwards"))
96                    })
97                    .map(|d| d.as_micros() as u64)
98                    .unwrap_or(0);
99                if mtime > last_modified {
100                    last_modified = mtime;
101                }
102            }
103        }
104        Ok(last_modified)
105    }
106
107    pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
108        let count = self.header.trigram_count as usize;
109        let table_start = self.header.trigram_table_offset as usize;
110
111        let mut low = 0;
112        let mut high = count;
113
114        while low < high {
115            let mid = low + (high - low) / 2;
116            let entry_off = table_start + mid * TRIGRAM_ENTRY_SIZE;
117
118            // Read trigram key (first 4 bytes)
119            let key_bytes = self.mmap.get(entry_off..entry_off + 4)?;
120            let key = u32::from_le_bytes(key_bytes.try_into().ok()?);
121
122            if key == trigram {
123                let entry = self.mmap.get(entry_off..entry_off + TRIGRAM_ENTRY_SIZE)?;
124
125                // Read posting_offset (u48, bytes 4..10)
126                let mut off_bytes = [0u8; 8];
127                off_bytes[..6].copy_from_slice(&entry[4..10]);
128                let posting_offset = u64::from_le_bytes(off_bytes);
129
130                let posting_length = entry
131                    .get(10..14)
132                    .and_then(|s| s.try_into().ok())
133                    .map(u32::from_le_bytes)
134                    .unwrap_or(0);
135                let doc_frequency = entry
136                    .get(14..18)
137                    .and_then(|s| s.try_into().ok())
138                    .map(u32::from_le_bytes)
139                    .unwrap_or(0);
140
141                return Some(TrigramInfo {
142                    posting_offset,
143                    posting_length,
144                    doc_frequency,
145                });
146            } else if key < trigram {
147                low = mid + 1;
148            } else {
149                high = mid;
150            }
151        }
152
153        None
154    }
155
156    pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
157        let start = info.posting_offset as usize;
158        let end = start + info.posting_length as usize;
159        if end > self.mmap.len() {
160            return Err(Error::PostingOutOfBounds);
161        }
162        PostingList::decode(&self.mmap[start..end])
163    }
164
165    pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
166        if file_id >= self.header.file_count {
167            return Err(Error::FileIdOutOfBounds(file_id));
168        }
169
170        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
171        let entry = self
172            .mmap
173            .get(entry_off..entry_off + FILE_ENTRY_SIZE)
174            .ok_or(Error::SectionOutOfBounds {
175                section: "file_entry",
176                offset: entry_off as u64,
177                size: FILE_ENTRY_SIZE as u64,
178                file_len: self.mmap.len() as u64,
179            })?;
180
181        let path_off = u32::from_le_bytes(
182            entry[4..8]
183                .try_into()
184                .map_err(|_| Error::Config("invalid path offset".into()))?,
185        );
186        let status = FileStatus::from_u8(entry[10]);
187        let mtime_ns = u64::from_le_bytes(
188            entry[12..20]
189                .try_into()
190                .map_err(|_| Error::Config("invalid mtime".into()))?,
191        );
192        let size_bytes = u64::from_le_bytes(
193            entry[20..28]
194                .try_into()
195                .map_err(|_| Error::Config("invalid size".into()))?,
196        );
197        let content_hash = u64::from_le_bytes(
198            entry[28..36]
199                .try_into()
200                .map_err(|_| Error::Config("invalid hash".into()))?,
201        );
202
203        let path = self.string_pool.resolve(path_off)?;
204
205        Ok(FileInfo {
206            file_id,
207            path: PathBuf::from(path),
208            status,
209            mtime_ns,
210            size_bytes,
211            content_hash,
212        })
213    }
214
215    pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
216        if !self.header.has_bloom() {
217            return true;
218        }
219
220        let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
221        let bloom_rel_off_bytes = self.mmap.get(entry_off + 40..entry_off + 44);
222        if bloom_rel_off_bytes.is_none() {
223            return true;
224        }
225
226        let bloom_rel_off = bloom_rel_off_bytes
227            .and_then(|b| b.try_into().ok())
228            .map(u32::from_le_bytes)
229            .unwrap_or(0);
230
231        let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;
232        if bloom_abs_off + 4 > self.mmap.len() {
233            return true;
234        }
235
236        let size = u16::from_le_bytes(
237            self.mmap[bloom_abs_off..bloom_abs_off + 2]
238                .try_into()
239                .unwrap_or([0u8; 2]),
240        ) as usize;
241        let num_hashes = self.mmap[bloom_abs_off + 2];
242        let bits = match self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) {
243            Some(b) => b,
244            None => return true,
245        };
246
247        BloomFilter::slice_contains(bits, num_hashes, trigram)
248    }
249}