ciff/
payload_vector.rs

1use std::convert::TryInto;
2use std::fs::File;
3use std::io::{self, BufRead, BufReader, BufWriter, Write};
4use std::ops::{Deref, Index};
5use std::path::Path;
6
/// Owned counterpart of [`PayloadSlice`] that keeps the whole encoded buffer
/// in memory inside the struct.
///
/// It is mostly useful for assembling the encoding before writing it to a
/// file, but it also fits the case where the bytes have been read into memory
/// in full and individual elements should be accessed without first decoding
/// the entire sequence into a `Vec`.
///
/// `PayloadVector` implements `Deref<Target = PayloadSlice>`, so all methods of
/// [`PayloadSlice`] are available on it through dereferencing.
#[derive(Debug, Clone)]
pub struct PayloadVector {
    /// The encoded bytes, in the format read by [`PayloadSlice`].
    data: Vec<u8>,
}
19
20impl AsRef<[u8]> for PayloadVector {
21    fn as_ref(&self) -> &[u8] {
22        &self.data
23    }
24}
25
26impl AsRef<PayloadSlice> for PayloadVector {
27    fn as_ref(&self) -> &PayloadSlice {
28        self
29    }
30}
31
32impl<Item> std::iter::FromIterator<Item> for PayloadVector
33where
34    Item: AsRef<[u8]>,
35{
36    fn from_iter<T: IntoIterator<Item = Item>>(iter: T) -> Self {
37        let mut data = Vec::<u8>::new();
38        let mut length: u64 = 0;
39
40        // Write empty bytes that will be modified once length is known.
41        data.extend(length.to_le_bytes());
42
43        // Must collect separately, to later append to `data`.
44        let mut payloads = Vec::<u8>::new();
45        let mut offset: u64 = 0;
46
47        data.extend(offset.to_le_bytes());
48        for item in iter {
49            let bytes: &[u8] = item.as_ref();
50            payloads.extend(bytes);
51            offset += bytes.len() as u64;
52            length += 1;
53            data.extend(offset.to_le_bytes());
54        }
55
56        data.extend(payloads);
57        data[..std::mem::size_of::<u64>()].copy_from_slice(&length.to_le_bytes());
58
59        Self { data }
60    }
61}
62
63impl Deref for PayloadVector {
64    type Target = PayloadSlice;
65    fn deref(&self) -> &Self::Target {
66        let data: *const [u8] = &*self.data;
67        let data = data as *const PayloadSlice;
68        // SAFETY: PayloadSlice just wraps [u8],
69        // and &*self.data is &[u8], therefore
70        // transmuting &[u8] to &PayloadSlice is safe.
71        unsafe { &*data }
72    }
73}
74
/// A borrowed view over a sequence of variable-sized elements (payloads)
/// packed into a single block of memory.
///
/// Elements are located through an offset table, so a sequence of, say,
/// strings can be indexed directly in a memory-mapped buffer without loading
/// every element into memory.
///
/// # Layout
///
/// All integers are little-endian `u64` values:
///
/// 1. the number of payloads `n`;
/// 2. `n + 1` offsets relative to the start of the payload area, such that
///    payload `i` occupies the bytes between offset `i` and offset `i + 1`;
/// 3. the payload bytes, concatenated.
///
/// # Use case
///
/// The main purpose of this struct is not so much to limit how many bytes are
/// held in memory as to limit the time needed to initialize it. For example,
/// a command line program can use it to quickly look up one or a few values
/// in a file-encoded vector without paying for parsing the entire file and
/// loading it into memory.
///
/// # Examples
///
/// ```
/// # use std::fs::File;
/// # use ciff::{PayloadSlice, PayloadVector};
/// # use tempfile::TempDir;
/// use memmap::Mmap;
/// # fn main() -> anyhow::Result<()> {
///
/// // We will store our vector in a temporary directory.
/// let temp = TempDir::new()?;
/// let file_path = temp.path().join("words.bin");
///
/// // We can use any elements that implement `AsRef<[u8]>`.
/// let words = vec!["dog", "cat", "gnu"];
///
/// // One way of encoding is to collect elements to a PayloadVector in memory.
/// let payloads: PayloadVector = words.into_iter().collect();
///
/// // Write to file.
/// let mut output = File::create(&file_path)?;
/// payloads.write(&mut output)?;
/// drop(output);
///
/// // Load as payload slice.
/// let input = File::open(&file_path)?;
/// let bytes = unsafe { Mmap::map(&input)? };
/// let payloads = PayloadSlice::new(&bytes);
///
/// // Note that it returns byte slices.
/// assert_eq!(&payloads[0], b"dog");
/// assert_eq!(&payloads[1], b"cat");
/// assert_eq!(&payloads[2], b"gnu");
///
/// // Non-panicking access.
/// assert_eq!(payloads.get(3), None);
///
/// // Collect to a vector of strings.
/// let items: Vec<_> = payloads
///     .iter()
///     .map(|b| String::from_utf8(b.to_vec()).unwrap())
///     .collect();
/// assert_eq!(items, vec![
///     "dog".to_string(),
///     "cat".to_string(),
///     "gnu".to_string()
/// ]);
///
/// # Ok(())
/// # }
/// ```
#[repr(transparent)]
pub struct PayloadSlice {
    /// The encoded bytes: item count, offset table, then the payloads.
    data: [u8],
}
143
144impl AsRef<PayloadSlice> for PayloadSlice {
145    fn as_ref(&self) -> &PayloadSlice {
146        self
147    }
148}
149
150impl Index<usize> for &'_ PayloadSlice {
151    type Output = [u8];
152    fn index(&self, index: usize) -> &Self::Output {
153        if let Some(payload) = self.get(index as u64) {
154            payload
155        } else {
156            panic!("index out of bounds: {}", index)
157        }
158    }
159}
160
161impl Index<usize> for PayloadVector {
162    type Output = [u8];
163    fn index(&self, index: usize) -> &Self::Output {
164        if let Some(payload) = self.get(index as u64) {
165            payload
166        } else {
167            panic!("index out of bounds: {}", index)
168        }
169    }
170}
171
172impl PayloadSlice {
173    /// Conctructs a new slice using the given underlying data.
174    #[must_use]
175    pub fn new(data: &[u8]) -> &Self {
176        let data: *const [u8] = data;
177        let data = data as *const PayloadSlice;
178        // SAFETY: PayloadSlice just wraps [u8],
179        // and &*data is &[u8], therefore
180        // transmuting &[u8] to &PayloadSlice is safe.
181        unsafe { &*data }
182    }
183
184    /// Writes the underlying memory to the output.
185    ///
186    /// # Errors
187    ///
188    /// Will return an error if an error occurs while writing to the output.
189    pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
190        writer.write_all(&self.data)?;
191        writer.flush()
192    }
193
194    /// Returns the element at position `index` or `None` if `index` is out of bounds.
195    #[must_use]
196    pub fn get(&self, index: u64) -> Option<&[u8]> {
197        if index >= self.len() {
198            None
199        } else {
200            let payloads_offset = (self.len() as usize + 2) * 8;
201            let offset_pos = (index as usize + 1) * 8;
202            let offset = payloads_offset + self.int_at(offset_pos) as usize;
203            let next_offset = payloads_offset + self.int_at(offset_pos + 8) as usize;
204            self.data.get(offset..next_offset)
205        }
206    }
207
208    /// Returns the length of the slice.
209    #[must_use]
210    pub fn len(&self) -> u64 {
211        self.int_at(0)
212    }
213
214    /// Checks if the slice is empty.
215    #[must_use]
216    pub fn is_empty(&self) -> bool {
217        self.len() == 0
218    }
219
220    /// Returns the iterator over all items.
221    pub fn iter(&self) -> impl Iterator<Item = &[u8]> {
222        PayloadIter {
223            slice: self,
224            index: 0,
225            length: self.len(),
226        }
227    }
228
229    fn int_at(&self, offset: usize) -> u64 {
230        u64::from_le_bytes(self.data[offset..offset + 8].try_into().unwrap())
231    }
232}
233
234/// Iterator over [`PayloadSlice`].
235pub struct PayloadIter<'a> {
236    slice: &'a PayloadSlice,
237    index: u64,
238    length: u64,
239}
240
241impl<'a> Iterator for PayloadIter<'a> {
242    type Item = &'a [u8];
243    fn next(&mut self) -> Option<Self::Item> {
244        if self.index == self.length {
245            None
246        } else {
247            let item = self.slice.get(self.index);
248            self.index += 1;
249            item
250        }
251    }
252}
253
254/// Builds a lexicon using the text file at `input` and writes it to `output`.
255///
256/// # Errors
257///
258/// Returns an error if any failure occurs during reading the input
259/// or writing to the output.
260pub fn build_lexicon(input: &Path, output: &Path) -> io::Result<()> {
261    let lex = BufReader::new(File::open(input)?)
262        .lines()
263        .collect::<Result<PayloadVector, _>>()?;
264    let mut lex_path = BufWriter::new(File::create(output)?);
265    lex.write(&mut lex_path)?;
266    lex_path.flush()?;
267    Ok(())
268}
269
#[cfg(test)]
mod test {
    use super::*;
    use std::io;
    use std::path::PathBuf;
    use tempfile::TempDir;

    /// The terms the tests expect to find, in order.
    const TERMS: [&[u8]; 6] = [b"aardvark", b"cat", b"dog", b"gnu", b"mouse", b"zebra"];

    /// Directory holding the on-disk fixtures used by the file-based tests.
    #[cfg(not(miri))]
    fn test_data_dir() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/test_data")
    }

    /// Checks every access path of `lex` against the expected `payloads`.
    ///
    /// The first out-of-range index and the emptiness expectation are derived
    /// from `payloads`, so the helper works for inputs of any length.
    fn assert_payloads<L: AsRef<PayloadSlice>>(lex: L, payloads: &[&[u8]]) {
        let lex = lex.as_ref();
        assert_eq!(lex.is_empty(), payloads.is_empty());
        assert_eq!(lex.len(), payloads.len() as u64);
        for (idx, payload) in payloads.iter().enumerate() {
            assert_eq!(lex.get(idx as u64), Some(*payload));
            assert_eq!(&lex[idx], *payload);
        }
        assert!(lex.get(payloads.len() as u64).is_none());
        assert_eq!(lex.iter().collect::<Vec<_>>(), payloads);
    }

    #[test]
    #[cfg(not(miri))]
    fn test_write() -> io::Result<()> {
        let test_data_dir = test_data_dir();
        let tmp = TempDir::new()?;
        let output = tmp.path().join("terms.lex");
        build_lexicon(&test_data_dir.join("terms.txt"), &output)?;
        let actual_lex_bytes = std::fs::read(output)?;
        let expected_lex_bytes = std::fs::read(test_data_dir.join("terms.lex"))?;
        assert_eq!(actual_lex_bytes, expected_lex_bytes);
        Ok(())
    }

    #[test]
    #[cfg(not(miri))]
    fn test_elements() -> io::Result<()> {
        let lex: PayloadVector = std::fs::read_to_string(test_data_dir().join("terms.txt"))?
            .split_whitespace()
            .map(str::to_string)
            .collect();

        for (idx, term) in TERMS.iter().enumerate() {
            assert_eq!(lex.get(idx as u64), Some(*term));
            // Exercises `Index` on the owning vector itself.
            assert_eq!(&lex[idx], *term);
        }
        assert_eq!(lex.iter().collect::<Vec<_>>(), TERMS.to_vec());

        Ok(())
    }

    #[test]
    fn test_element_access() {
        let lex: PayloadVector = TERMS
            .iter()
            .map(|&b| String::from_utf8(Vec::from(b)).unwrap())
            .collect();
        assert_payloads(&lex, &TERMS);
        assert_payloads(PayloadSlice::new(lex.as_ref()), &TERMS);
    }

    #[test]
    fn test_empty() {
        let lex: PayloadVector = Vec::<&[u8]>::new().into_iter().collect();
        assert_payloads(&lex, &[]);
        assert_payloads(PayloadSlice::new(lex.as_ref()), &[]);
    }
}