1use crate::bloom::BloomFilter;
6use crate::error::{Error, Result};
7use crate::format::*;
8use crate::posting::PostingList;
9use crate::string_pool::StringPoolReader;
10use crate::trigram::Trigram;
11use memmap2::Mmap;
12use std::fs::File;
13use std::path::{Path, PathBuf};
14use std::time::UNIX_EPOCH;
15
/// Read-only view over an on-disk index file that is memory-mapped by
/// [`Reader::open`].
pub struct Reader {
    /// Memory mapping of the whole index file; all lookups slice into it.
    mmap: Mmap,
    /// Header parsed from the first `HEADER_SIZE` bytes of the mapping.
    pub header: Header,
    /// Reader over the string-pool section. `open` hands it a slice of
    /// `mmap` whose lifetime has been extended to `'static`, so it must not
    /// be used after `mmap` is gone.
    string_pool: StringPoolReader<'static>,
}
21
/// One decoded trigram-table entry, as returned by `Reader::get_trigram`.
#[derive(Debug)]
pub struct TrigramInfo {
    /// Start of the posting list in the index file; `decode_postings` uses it
    /// as an absolute offset into the mapping. Stored as 6 little-endian bytes.
    pub posting_offset: u64,
    /// Length of the encoded posting list in bytes.
    pub posting_length: u32,
    /// Presumably the number of files containing the trigram — the reader
    /// only copies this value through; confirm against the writer.
    pub doc_frequency: u32,
}
28
/// One decoded file-table entry, as returned by `Reader::get_file`.
#[derive(Debug)]
pub struct FileInfo {
    /// Index of the entry in the file table (the id that was looked up).
    pub file_id: u32,
    /// File path, resolved from the string pool.
    pub path: PathBuf,
    /// Status decoded from a single byte of the entry via `FileStatus::from_u8`.
    pub status: FileStatus,
    /// Modification time as stored in the entry; the name suggests
    /// nanoseconds — confirm against the writer.
    pub mtime_ns: u64,
    /// File size as stored in the entry; the name suggests bytes.
    pub size_bytes: u64,
    /// Content hash as stored in the entry; the algorithm is not visible here.
    pub content_hash: u64,
}
38
39impl Reader {
40 pub fn open(path: &Path) -> Result<Self> {
41 let file = File::open(path)?;
42 let mmap = unsafe { Mmap::map(&file)? };
43
44 if mmap.len() < HEADER_SIZE {
45 return Err(Error::IndexTooSmall);
46 }
47
48 let header = Header::parse(&mmap[0..HEADER_SIZE])?;
49 header.validate_bounds(mmap.len() as u64)?;
50
51 let string_pool_data: &'static [u8] = unsafe {
54 let slice = &mmap[header.string_pool_offset as usize
55 ..(header.string_pool_offset + header.string_pool_size) as usize];
56 std::mem::transmute(slice)
57 };
58 let string_pool = StringPoolReader::new(string_pool_data)?;
59
60 Ok(Self {
61 mmap,
62 header,
63 string_pool,
64 })
65 }
66
67 pub fn get_last_modified(root: &Path) -> Result<u64> {
68 let mut last_modified = 0u64;
69 let walker = ignore::WalkBuilder::new(root)
70 .hidden(false)
71 .git_ignore(true)
72 .require_git(false)
73 .add_custom_ignore_filename(".ixignore")
74 .filter_entry(move |entry| {
75 let path = entry.path();
76 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
77
78 if entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
80 && (name == "lost+found" || name == ".git" || name == "node_modules" ||
81 name == "target" || name == "__pycache__" || name == ".tox" ||
82 name == ".venv" || name == "venv" || name == ".ix")
83 {
84 return false;
85 }
86
87 if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
89 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
90 match ext {
91 "so" | "o" | "dylib" | "a" | "dll" | "exe" | "pyc" |
93 "jpg" | "png" | "gif" | "mp4" | "mp3" | "pdf" |
95 "zip" | "7z" | "rar" |
97 "sqlite" | "db" | "bin" => return false,
99 _ => {}
100 }
101 if name.ends_with(".tar.gz") {
102 return false;
103 }
104 }
105 true
106 })
107 .build();
108
109 for result in walker {
110 match result {
111 Ok(entry) => {
112 if entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
113 let metadata = entry.metadata().map_err(|e| Error::Config(e.to_string()))?;
114 let mtime = metadata
115 .modified()
116 .and_then(|t| {
117 t.duration_since(UNIX_EPOCH)
118 .map_err(|_| std::io::Error::other("time went backwards"))
119 })
120 .map(|d| d.as_micros() as u64)
121 .unwrap_or(0);
122 if mtime > last_modified {
123 last_modified = mtime;
124 }
125 }
126 }
127 Err(e) => {
128 eprintln!("ix: warning: stale check skipping path: {}", e);
129 }
130 }
131 }
132 Ok(last_modified)
133 }
134
135 pub fn get_trigram(&self, trigram: Trigram) -> Option<TrigramInfo> {
136 let count = self.header.trigram_count as usize;
137 let table_start = self.header.trigram_table_offset as usize;
138
139 let mut low = 0;
140 let mut high = count;
141
142 while low < high {
143 let mid = low + (high - low) / 2;
144 let entry_off = table_start + mid * TRIGRAM_ENTRY_SIZE;
145
146 let key_bytes = self.mmap.get(entry_off..entry_off + 4)?;
148 let key = u32::from_le_bytes(key_bytes.try_into().ok()?);
149
150 if key == trigram {
151 let entry = self.mmap.get(entry_off..entry_off + TRIGRAM_ENTRY_SIZE)?;
152
153 let mut off_bytes = [0u8; 8];
155 off_bytes[..6].copy_from_slice(&entry[4..10]);
156 let posting_offset = u64::from_le_bytes(off_bytes);
157
158 let posting_length = entry
159 .get(10..14)
160 .and_then(|s| s.try_into().ok())
161 .map(u32::from_le_bytes)
162 .unwrap_or(0);
163 let doc_frequency = entry
164 .get(14..18)
165 .and_then(|s| s.try_into().ok())
166 .map(u32::from_le_bytes)
167 .unwrap_or(0);
168
169 return Some(TrigramInfo {
170 posting_offset,
171 posting_length,
172 doc_frequency,
173 });
174 } else if key < trigram {
175 low = mid + 1;
176 } else {
177 high = mid;
178 }
179 }
180
181 None
182 }
183
184 pub fn decode_postings(&self, info: &TrigramInfo) -> Result<PostingList> {
185 let start = info.posting_offset as usize;
186 let end = start + info.posting_length as usize;
187 if end > self.mmap.len() {
188 return Err(Error::PostingOutOfBounds);
189 }
190 PostingList::decode(&self.mmap[start..end])
191 }
192
193 pub fn get_file(&self, file_id: u32) -> Result<FileInfo> {
194 if file_id >= self.header.file_count {
195 return Err(Error::FileIdOutOfBounds(file_id));
196 }
197
198 let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
199 let entry = self
200 .mmap
201 .get(entry_off..entry_off + FILE_ENTRY_SIZE)
202 .ok_or(Error::SectionOutOfBounds {
203 section: "file_entry",
204 offset: entry_off as u64,
205 size: FILE_ENTRY_SIZE as u64,
206 file_len: self.mmap.len() as u64,
207 })?;
208
209 let path_off = u32::from_le_bytes(
210 entry[4..8]
211 .try_into()
212 .map_err(|_| Error::Config("invalid path offset".into()))?,
213 );
214 let status = FileStatus::from_u8(entry[10]);
215 let mtime_ns = u64::from_le_bytes(
216 entry[12..20]
217 .try_into()
218 .map_err(|_| Error::Config("invalid mtime".into()))?,
219 );
220 let size_bytes = u64::from_le_bytes(
221 entry[20..28]
222 .try_into()
223 .map_err(|_| Error::Config("invalid size".into()))?,
224 );
225 let content_hash = u64::from_le_bytes(
226 entry[28..36]
227 .try_into()
228 .map_err(|_| Error::Config("invalid hash".into()))?,
229 );
230
231 let path = self.string_pool.resolve(path_off)?;
232
233 Ok(FileInfo {
234 file_id,
235 path: PathBuf::from(path),
236 status,
237 mtime_ns,
238 size_bytes,
239 content_hash,
240 })
241 }
242
243 pub fn bloom_may_contain(&self, file_id: u32, trigram: Trigram) -> bool {
244 if !self.header.has_bloom() {
245 return true;
246 }
247
248 let entry_off = self.header.file_table_offset as usize + file_id as usize * FILE_ENTRY_SIZE;
249 let bloom_rel_off_bytes = self.mmap.get(entry_off + 40..entry_off + 44);
250 if bloom_rel_off_bytes.is_none() {
251 return true;
252 }
253
254 let bloom_rel_off = bloom_rel_off_bytes
255 .and_then(|b| b.try_into().ok())
256 .map(u32::from_le_bytes)
257 .unwrap_or(0);
258
259 let bloom_abs_off = self.header.bloom_offset as usize + bloom_rel_off as usize;
260 if bloom_abs_off + 4 > self.mmap.len() {
261 return true;
262 }
263
264 let size = u16::from_le_bytes(
265 self.mmap[bloom_abs_off..bloom_abs_off + 2]
266 .try_into()
267 .unwrap_or([0u8; 2]),
268 ) as usize;
269 let num_hashes = self.mmap[bloom_abs_off + 2];
270 let bits = match self.mmap.get(bloom_abs_off + 4..bloom_abs_off + 4 + size) {
271 Some(b) => b,
272 None => return true,
273 };
274
275 BloomFilter::slice_contains(bits, num_hashes, trigram)
276 }
277}