//! sift_core/storage/lexicon.rs
use std::fs::File;
4use std::io::{BufWriter, Write};
5use std::path::Path;
6
7use memmap2::Mmap;
8
9use crate::storage::format::{write_magic, LEXICON_MAGIC};
10use crate::storage::mmap::open_mmap;
11
/// One record in the serialized trigram lexicon: maps a 3-byte trigram
/// to the location and size of its associated data.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexiconEntry {
    /// The 3-byte trigram key.
    pub trigram: [u8; 3],
    /// Byte offset of this trigram's data — presumably into a companion
    /// postings file; confirm against the callers that write it.
    pub offset: u64,
    /// Length in bytes of this trigram's data.
    pub len: u32,
}
18
19pub fn write_lexicon(out_path: &Path, entries: &[LexiconEntry]) -> std::io::Result<()> {
25 let f = File::create(out_path)?;
26 let mut w = BufWriter::new(f);
27 write_magic(&mut w, LEXICON_MAGIC)?;
28 let n: u32 = entries
29 .len()
30 .try_into()
31 .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidInput, "lexicon too large"))?;
32 w.write_all(&n.to_le_bytes())?;
33 for e in entries {
34 w.write_all(&e.trigram)?;
35 w.write_all(&e.offset.to_le_bytes())?;
36 w.write_all(&e.len.to_le_bytes())?;
37 }
38 w.flush()?;
39 Ok(())
40}
41
/// A read-only view over a serialized lexicon, backed either by a
/// memory-mapped file or by an owned in-memory buffer.
#[derive(Debug)]
pub struct MappedLexicon {
    // Raw serialized bytes (magic + count header + entry records).
    backing: Backing,
    // Number of entries, decoded from the header at construction time.
    count: usize,
}
50
/// Storage for the serialized lexicon bytes.
#[derive(Debug)]
enum Backing {
    /// Bytes memory-mapped from a file on disk.
    Mmap(Mmap),
    /// Bytes held in an owned heap buffer (built in memory).
    Owned(Vec<u8>),
}
56
57impl MappedLexicon {
58 const ENTRY_SIZE: usize = 15;
59
60 fn bytes(&self) -> &[u8] {
61 match &self.backing {
62 Backing::Mmap(m) => m.as_ref(),
63 Backing::Owned(v) => v.as_slice(),
64 }
65 }
66
67 #[must_use]
73 pub fn from_entries(entries: &[LexiconEntry]) -> Self {
74 let count = entries.len();
75 let size = LEXICON_MAGIC.len() + 4 + count * Self::ENTRY_SIZE;
76 let mut data = vec![0u8; size];
77 let mut cursor = 0;
78
79 data[cursor..cursor + LEXICON_MAGIC.len()].copy_from_slice(&LEXICON_MAGIC);
80 cursor += LEXICON_MAGIC.len();
81
82 data[cursor..cursor + 4].copy_from_slice(&u32::try_from(count).unwrap().to_le_bytes());
83 cursor += 4;
84
85 for e in entries {
86 data[cursor..cursor + 3].copy_from_slice(&e.trigram);
87 cursor += 3;
88 data[cursor..cursor + 8].copy_from_slice(&e.offset.to_le_bytes());
89 cursor += 8;
90 data[cursor..cursor + 4].copy_from_slice(&e.len.to_le_bytes());
91 cursor += 4;
92 }
93
94 Self {
95 backing: Backing::Owned(data),
96 count,
97 }
98 }
99
100 pub fn open(path: &Path) -> std::io::Result<Self> {
106 let mmap = open_mmap(path)?;
107 let bytes = mmap.as_ref();
108 let count = Self::validate(bytes)?;
109 Ok(Self {
110 backing: Backing::Mmap(mmap),
111 count,
112 })
113 }
114
115 fn validate(bytes: &[u8]) -> std::io::Result<usize> {
116 let magic_len = LEXICON_MAGIC.len();
117 if bytes.len() < magic_len + 4 {
118 return Err(std::io::Error::new(
119 std::io::ErrorKind::InvalidData,
120 "lexicon too short for magic+count",
121 ));
122 }
123 if bytes[..magic_len] != LEXICON_MAGIC {
124 return Err(std::io::Error::new(
125 std::io::ErrorKind::InvalidData,
126 "unexpected lexicon magic",
127 ));
128 }
129 let n = u32::from_le_bytes(bytes[magic_len..magic_len + 4].try_into().unwrap()) as usize;
130 let expected_bytes = n * Self::ENTRY_SIZE;
131 if bytes.len() < magic_len + 4 + expected_bytes {
132 return Err(std::io::Error::new(
133 std::io::ErrorKind::InvalidData,
134 "lexicon truncated",
135 ));
136 }
137 Ok(n)
138 }
139
140 #[must_use]
141 pub const fn len(&self) -> usize {
142 self.count
143 }
144
145 #[must_use]
146 pub const fn is_empty(&self) -> bool {
147 self.count == 0
148 }
149
150 #[must_use]
156 pub fn get(&self, tri: [u8; 3]) -> Option<LexiconEntry> {
157 if self.count == 0 {
158 return None;
159 }
160 let bytes = self.bytes();
161 let magic_len = LEXICON_MAGIC.len();
162 let data_start = magic_len + 4;
163
164 let mut lo = 0;
165 let mut hi = self.count;
166
167 while lo < hi {
168 let mid = lo + (hi - lo) / 2;
169 let offset = data_start + mid * Self::ENTRY_SIZE;
170 let entry_tri: [u8; 3] = bytes[offset..offset + 3].try_into().unwrap();
171
172 match entry_tri.cmp(&tri) {
173 std::cmp::Ordering::Less => lo = mid + 1,
174 std::cmp::Ordering::Greater => hi = mid,
175 std::cmp::Ordering::Equal => {
176 let off =
177 u64::from_le_bytes(bytes[offset + 3..offset + 11].try_into().unwrap());
178 let len =
179 u32::from_le_bytes(bytes[offset + 11..offset + 15].try_into().unwrap());
180 return Some(LexiconEntry {
181 trigram: entry_tri,
182 offset: off,
183 len,
184 });
185 }
186 }
187 }
188 None
189 }
190
191 #[must_use]
192 pub const fn iter(&self) -> MappedLexiconIter<'_> {
193 MappedLexiconIter {
194 lexicon: self,
195 pos: 0,
196 }
197 }
198
199 #[must_use]
200 pub fn backing_slice(&self) -> &[u8] {
201 self.bytes()
202 }
203}
204
205impl<'a> IntoIterator for &'a MappedLexicon {
206 type Item = LexiconEntry;
207 type IntoIter = MappedLexiconIter<'a>;
208
209 fn into_iter(self) -> Self::IntoIter {
210 MappedLexiconIter {
211 lexicon: self,
212 pos: 0,
213 }
214 }
215}
216
217pub struct MappedLexiconIter<'_a> {
218 lexicon: &'_a MappedLexicon,
219 pos: usize,
220}
221
222impl Iterator for MappedLexiconIter<'_> {
223 type Item = LexiconEntry;
224
225 fn next(&mut self) -> Option<Self::Item> {
226 if self.pos >= self.lexicon.count {
227 return None;
228 }
229 let bytes = self.lexicon.bytes();
230 let magic_len = LEXICON_MAGIC.len();
231 let offset = magic_len + 4 + self.pos * MappedLexicon::ENTRY_SIZE;
232 let tri: [u8; 3] = bytes[offset..offset + 3].try_into().unwrap();
233 let off = u64::from_le_bytes(bytes[offset + 3..offset + 11].try_into().unwrap());
234 let len = u32::from_le_bytes(bytes[offset + 11..offset + 15].try_into().unwrap());
235 self.pos += 1;
236 Some(LexiconEntry {
237 trigram: tri,
238 offset: off,
239 len,
240 })
241 }
242}