1use std::convert::TryInto;
2use std::fs::File;
3use std::io::{self, BufRead, BufReader, BufWriter, Write};
4use std::ops::{Deref, Index};
5use std::path::Path;
6
/// Owning container for a sequence of variable-length byte payloads kept in
/// one contiguous buffer.
///
/// Values are built through the `FromIterator` implementation and read through
/// [`PayloadSlice`], which this type dereferences to.
#[derive(Clone, Debug)]
pub struct PayloadVector {
    /// Complete encoded buffer: element count, offset table, then payload bytes.
    data: Vec<u8>,
}
19
20impl AsRef<[u8]> for PayloadVector {
21 fn as_ref(&self) -> &[u8] {
22 &self.data
23 }
24}
25
26impl AsRef<PayloadSlice> for PayloadVector {
27 fn as_ref(&self) -> &PayloadSlice {
28 self
29 }
30}
31
32impl<Item> std::iter::FromIterator<Item> for PayloadVector
33where
34 Item: AsRef<[u8]>,
35{
36 fn from_iter<T: IntoIterator<Item = Item>>(iter: T) -> Self {
37 let mut data = Vec::<u8>::new();
38 let mut length: u64 = 0;
39
40 data.extend(length.to_le_bytes());
42
43 let mut payloads = Vec::<u8>::new();
45 let mut offset: u64 = 0;
46
47 data.extend(offset.to_le_bytes());
48 for item in iter {
49 let bytes: &[u8] = item.as_ref();
50 payloads.extend(bytes);
51 offset += bytes.len() as u64;
52 length += 1;
53 data.extend(offset.to_le_bytes());
54 }
55
56 data.extend(payloads);
57 data[..std::mem::size_of::<u64>()].copy_from_slice(&length.to_le_bytes());
58
59 Self { data }
60 }
61}
62
63impl Deref for PayloadVector {
64 type Target = PayloadSlice;
65 fn deref(&self) -> &Self::Target {
66 let data: *const [u8] = &*self.data;
67 let data = data as *const PayloadSlice;
68 unsafe { &*data }
72 }
73}
74
/// Borrowed, unsized view over an encoded sequence of variable-length byte
/// payloads.
///
/// The wrapped bytes are read as:
/// 1. the number of payloads `n` as a little-endian `u64`;
/// 2. `n + 1` little-endian `u64` offsets, relative to the start of the
///    payload area, where payload `i` spans `offsets[i]..offsets[i + 1]`;
/// 3. the concatenated payload bytes.
///
/// The type is `#[repr(transparent)]` over `[u8]`, which is what allows a
/// `&[u8]` to be reinterpreted as a `&PayloadSlice` without copying.
#[derive(Debug)]
#[repr(transparent)]
pub struct PayloadSlice {
    /// Raw encoded bytes (header followed by payloads).
    data: [u8],
}
143
144impl AsRef<PayloadSlice> for PayloadSlice {
145 fn as_ref(&self) -> &PayloadSlice {
146 self
147 }
148}
149
150impl Index<usize> for &'_ PayloadSlice {
151 type Output = [u8];
152 fn index(&self, index: usize) -> &Self::Output {
153 if let Some(payload) = self.get(index as u64) {
154 payload
155 } else {
156 panic!("index out of bounds: {}", index)
157 }
158 }
159}
160
161impl Index<usize> for PayloadVector {
162 type Output = [u8];
163 fn index(&self, index: usize) -> &Self::Output {
164 if let Some(payload) = self.get(index as u64) {
165 payload
166 } else {
167 panic!("index out of bounds: {}", index)
168 }
169 }
170}
171
172impl PayloadSlice {
173 #[must_use]
175 pub fn new(data: &[u8]) -> &Self {
176 let data: *const [u8] = data;
177 let data = data as *const PayloadSlice;
178 unsafe { &*data }
182 }
183
184 pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
190 writer.write_all(&self.data)?;
191 writer.flush()
192 }
193
194 #[must_use]
196 pub fn get(&self, index: u64) -> Option<&[u8]> {
197 if index >= self.len() {
198 None
199 } else {
200 let payloads_offset = (self.len() as usize + 2) * 8;
201 let offset_pos = (index as usize + 1) * 8;
202 let offset = payloads_offset + self.int_at(offset_pos) as usize;
203 let next_offset = payloads_offset + self.int_at(offset_pos + 8) as usize;
204 self.data.get(offset..next_offset)
205 }
206 }
207
208 #[must_use]
210 pub fn len(&self) -> u64 {
211 self.int_at(0)
212 }
213
214 #[must_use]
216 pub fn is_empty(&self) -> bool {
217 self.len() == 0
218 }
219
220 pub fn iter(&self) -> impl Iterator<Item = &[u8]> {
222 PayloadIter {
223 slice: self,
224 index: 0,
225 length: self.len(),
226 }
227 }
228
229 fn int_at(&self, offset: usize) -> u64 {
230 u64::from_le_bytes(self.data[offset..offset + 8].try_into().unwrap())
231 }
232}
233
234pub struct PayloadIter<'a> {
236 slice: &'a PayloadSlice,
237 index: u64,
238 length: u64,
239}
240
241impl<'a> Iterator for PayloadIter<'a> {
242 type Item = &'a [u8];
243 fn next(&mut self) -> Option<Self::Item> {
244 if self.index == self.length {
245 None
246 } else {
247 let item = self.slice.get(self.index);
248 self.index += 1;
249 item
250 }
251 }
252}
253
254pub fn build_lexicon(input: &Path, output: &Path) -> io::Result<()> {
261 let lex = BufReader::new(File::open(input)?)
262 .lines()
263 .collect::<Result<PayloadVector, _>>()?;
264 let mut lex_path = BufWriter::new(File::create(output)?);
265 lex.write(&mut lex_path)?;
266 lex_path.flush()?;
267 Ok(())
268}
269
#[cfg(test)]
mod test {
    use super::*;
    use std::io;
    use std::path::PathBuf;
    use tempfile::TempDir;

    /// Building a lexicon from the `terms.txt` fixture reproduces the
    /// `terms.lex` fixture byte for byte.
    #[test]
    #[cfg(not(miri))]
    fn test_write() -> io::Result<()> {
        let test_data_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/test_data");
        let tmp = TempDir::new()?;
        let output = tmp.path().join("terms.lex");
        build_lexicon(&test_data_dir.join("terms.txt"), &output)?;
        let actual_lex_bytes = std::fs::read(output)?;
        let expected_lex_bytes = std::fs::read(test_data_dir.join("terms.lex"))?;
        assert_eq!(actual_lex_bytes, expected_lex_bytes);
        Ok(())
    }

    /// Terms collected from the `terms.txt` fixture are reachable through
    /// `get`, indexing, and iteration.
    #[test]
    #[cfg(not(miri))]
    fn test_elements() -> io::Result<()> {
        let test_data_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/test_data");
        let lex: PayloadVector = std::fs::read_to_string(test_data_dir.join("terms.txt"))?
            .split_whitespace()
            .map(str::to_string)
            .collect();

        assert_eq!(lex.get(0), Some(b"aardvark".as_ref()));
        assert_eq!(lex.get(1), Some(b"cat".as_ref()));
        assert_eq!(lex.get(2), Some(b"dog".as_ref()));
        assert_eq!(lex.get(3), Some(b"gnu".as_ref()));
        assert_eq!(lex.get(4), Some(b"mouse".as_ref()));
        assert_eq!(lex.get(5), Some(b"zebra".as_ref()));

        assert_eq!(&lex[0], b"aardvark".as_ref());
        assert_eq!(&lex[1], b"cat".as_ref());
        assert_eq!(&lex[2], b"dog".as_ref());
        assert_eq!(&lex[3], b"gnu".as_ref());
        assert_eq!(&lex[4], b"mouse".as_ref());
        assert_eq!(&lex[5], b"zebra".as_ref());

        let expected = vec![
            b"aardvark".as_ref(),
            b"cat".as_ref(),
            b"dog".as_ref(),
            b"gnu".as_ref(),
            b"mouse".as_ref(),
            b"zebra".as_ref(),
        ];
        assert_eq!(lex.iter().collect::<Vec<_>>(), expected);

        Ok(())
    }

    /// Checks that `lex` exposes exactly `payloads` through every access path.
    ///
    /// All bounds are derived from `payloads` rather than hard-coded, so the
    /// helper stays correct for any number of expected payloads.
    fn assert_payloads<L: AsRef<PayloadSlice>>(lex: L, payloads: &[&[u8]]) {
        let lex = lex.as_ref();
        let expected_len = payloads.len() as u64;
        assert_eq!(lex.is_empty(), payloads.is_empty());
        assert_eq!(lex.len(), expected_len);
        for (idx, payload) in payloads.iter().enumerate() {
            assert_eq!(lex.get(idx as u64), Some(*payload));
            assert_eq!(&lex[idx], *payload);
        }
        // The first index past the end must not resolve to a payload.
        assert!(lex.get(expected_len).is_none());
        assert_eq!(lex.iter().collect::<Vec<_>>(), payloads);
    }

    /// Exercises element access through both the owned vector and a slice
    /// re-created from its raw bytes.
    #[test]
    fn test_element_access() {
        let payloads = vec![
            b"aardvark".as_ref(),
            b"cat".as_ref(),
            b"dog".as_ref(),
            b"gnu".as_ref(),
            b"mouse".as_ref(),
            b"zebra".as_ref(),
        ];
        let lex: PayloadVector = payloads
            .iter()
            .map(|&b| String::from_utf8(Vec::from(b)).unwrap())
            .collect();
        assert_payloads(&lex, &payloads);
        assert_payloads(PayloadSlice::new(lex.as_ref()), &payloads);
    }
}