// bom/bom.rs

use thiserror::{self, Error};

use std::{
    collections::{BTreeMap, HashMap, HashSet},
    fmt,
    fs::File,
    hash::Hash,
    io::{self, Read, Seek, SeekFrom},
    ops::{Deref, Range},
    path::Path,
    sync::Arc,
};

use deku::{DekuContainerRead, reader::Reader};
use memmap2::Mmap;

pub use crate::model::*;
18
19pub type BOMResult<T> = Result<T, BOMEror>;
20
21#[derive(Clone)]
22pub struct ByteSource {
23    inner: Arc<ByteSourceInner>,
24}
25
26enum ByteSourceInner {
27    Owned(Box<[u8]>),
28    Mmap(Mmap),
29}
30
31impl ByteSource {
32    pub fn from_vec(bytes: Vec<u8>) -> Self {
33        Self::from_boxed_slice(bytes.into_boxed_slice())
34    }
35
36    pub fn from_boxed_slice(bytes: Box<[u8]>) -> Self {
37        Self {
38            inner: Arc::new(ByteSourceInner::Owned(bytes)),
39        }
40    }
41
42    pub fn from_mmap(mmap: Mmap) -> Self {
43        Self {
44            inner: Arc::new(ByteSourceInner::Mmap(mmap)),
45        }
46    }
47
48    pub fn from_reader<R>(mut reader: R) -> io::Result<Self>
49    where
50        R: Read,
51    {
52        let mut bytes = Vec::new();
53        reader.read_to_end(&mut bytes)?;
54        Ok(Self::from_vec(bytes))
55    }
56
57    pub fn as_slice(&self) -> &[u8] {
58        match self.inner.as_ref() {
59            ByteSourceInner::Owned(bytes) => bytes,
60            ByteSourceInner::Mmap(mmap) => mmap,
61        }
62    }
63
64    pub fn len(&self) -> usize {
65        self.as_slice().len()
66    }
67
68    pub fn is_empty(&self) -> bool {
69        self.as_slice().is_empty()
70    }
71
72    pub fn slice(&self, range: Range<usize>) -> BOMResult<ByteSlice> {
73        if range.start > range.end || range.end > self.len() {
74            return Err(BOMEror::InvalidByteRange {
75                offset: range.start,
76                len: range.end.saturating_sub(range.start),
77                source_len: self.len(),
78            });
79        }
80
81        Ok(ByteSlice {
82            source: self.clone(),
83            range,
84        })
85    }
86}
87
88impl fmt::Debug for ByteSource {
89    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
90        f.debug_struct("ByteSource")
91            .field("len", &self.len())
92            .finish_non_exhaustive()
93    }
94}
95
96#[derive(Clone)]
97pub struct ByteSlice {
98    source: ByteSource,
99    range: Range<usize>,
100}
101
102impl ByteSlice {
103    pub fn from_vec(bytes: Vec<u8>) -> Self {
104        Self::from_boxed_slice(bytes.into_boxed_slice())
105    }
106
107    pub fn from_boxed_slice(bytes: Box<[u8]>) -> Self {
108        let source = ByteSource::from_boxed_slice(bytes);
109        source
110            .slice(0..source.len())
111            .expect("full byte source range should be valid")
112    }
113
114    pub fn as_slice(&self) -> &[u8] {
115        &self.source.as_slice()[self.range.clone()]
116    }
117
118    pub fn len(&self) -> usize {
119        self.range.end - self.range.start
120    }
121
122    pub fn is_empty(&self) -> bool {
123        self.len() == 0
124    }
125
126    pub fn to_vec(&self) -> Vec<u8> {
127        self.as_slice().to_vec()
128    }
129
130    pub fn absolute_range(&self) -> Range<usize> {
131        self.range.clone()
132    }
133
134    pub fn slice(&self, range: Range<usize>) -> BOMResult<Self> {
135        if range.start > range.end || range.end > self.len() {
136            return Err(BOMEror::InvalidByteRange {
137                offset: self.range.start.saturating_add(range.start),
138                len: range.end.saturating_sub(range.start),
139                source_len: self.source.len(),
140            });
141        }
142
143        let start = self.range.start + range.start;
144        let end = self.range.start + range.end;
145        self.source.slice(start..end)
146    }
147}
148
149impl Deref for ByteSlice {
150    type Target = [u8];
151
152    fn deref(&self) -> &Self::Target {
153        self.as_slice()
154    }
155}
156
157impl fmt::Debug for ByteSlice {
158    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
159        f.debug_struct("ByteSlice")
160            .field("range", &self.range)
161            .field("len", &self.len())
162            .finish()
163    }
164}
165
166#[derive(Clone, Debug)]
167pub struct BOMBlock {
168    bytes: ByteSlice,
169    position: u64,
170}
171
172impl BOMBlock {
173    pub fn new(bytes: ByteSlice) -> Self {
174        Self { bytes, position: 0 }
175    }
176
177    pub fn as_slice(&self) -> &[u8] {
178        self.bytes.as_slice()
179    }
180
181    pub fn byte_slice(&self) -> &ByteSlice {
182        &self.bytes
183    }
184
185    pub fn len(&self) -> usize {
186        self.bytes.len()
187    }
188
189    pub fn is_empty(&self) -> bool {
190        self.bytes.is_empty()
191    }
192
193    pub fn slice_at_current(&mut self, len: usize) -> BOMResult<ByteSlice> {
194        let start = usize::try_from(self.position).map_err(|_| BOMEror::InvalidByteRange {
195            offset: usize::MAX,
196            len,
197            source_len: self.bytes.len(),
198        })?;
199        let end = start.checked_add(len).ok_or(BOMEror::InvalidByteRange {
200            offset: start,
201            len,
202            source_len: self.bytes.len(),
203        })?;
204        let slice = self.bytes.slice(start..end)?;
205        self.position = end as u64;
206        Ok(slice)
207    }
208}
209
210impl Read for BOMBlock {
211    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
212        let pos = usize::try_from(self.position).unwrap_or(usize::MAX);
213        let data = self.bytes.as_slice();
214        if pos >= data.len() {
215            return Ok(0);
216        }
217
218        let amt = buf.len().min(data.len() - pos);
219        buf[..amt].copy_from_slice(&data[pos..pos + amt]);
220        self.position += amt as u64;
221        Ok(amt)
222    }
223}
224
225impl Seek for BOMBlock {
226    fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
227        let len = self.bytes.len() as i128;
228        let current = self.position as i128;
229        let next = match pos {
230            SeekFrom::Start(offset) => offset as i128,
231            SeekFrom::End(offset) => len.checked_add(offset as i128).ok_or_else(|| {
232                io::Error::new(io::ErrorKind::InvalidInput, "seek position overflow")
233            })?,
234            SeekFrom::Current(offset) => current.checked_add(offset as i128).ok_or_else(|| {
235                io::Error::new(io::ErrorKind::InvalidInput, "seek position overflow")
236            })?,
237        };
238
239        if next < 0 {
240            return Err(io::Error::new(
241                io::ErrorKind::InvalidInput,
242                "invalid seek before start of BOM block",
243            ));
244        }
245
246        self.position = next as u64;
247        Ok(self.position)
248    }
249}
250
251pub struct BOM {
252    source: ByteSource,
253    store_header: StoreHeader,
254}
255
256impl BOM {
257    pub fn new<R>(mut reader: R) -> BOMResult<Self>
258    where
259        R: Read + Seek,
260    {
261        reader.seek(SeekFrom::Start(0))?;
262        let source = ByteSource::from_reader(reader)?;
263        Self::from_byte_source(source)
264    }
265
266    pub fn from_bytes(bytes: Vec<u8>) -> BOMResult<Self> {
267        Self::from_byte_source(ByteSource::from_vec(bytes))
268    }
269
270    pub fn from_boxed_slice(bytes: Box<[u8]>) -> BOMResult<Self> {
271        Self::from_byte_source(ByteSource::from_boxed_slice(bytes))
272    }
273
274    pub fn from_byte_source(source: ByteSource) -> BOMResult<Self> {
275        let full = source.slice(0..source.len())?;
276        let mut block = BOMBlock::new(full);
277        let (_, store_header) = StoreHeader::from_reader((&mut block, 0))?;
278        Ok(BOM {
279            source,
280            store_header,
281        })
282    }
283
284    pub fn source(&self) -> &ByteSource {
285        &self.source
286    }
287
288    fn block_with_name(&mut self, name: &[u8]) -> BOMResult<BOMBlock> {
289        let var = self
290            .store_header
291            .var_with_name(name)
292            .ok_or(BOMEror::NotFoundVar(
293                String::from_utf8_lossy(name).to_string(),
294            ))?;
295
296        self.block_at(var.index as usize)
297    }
298
299    fn block_at(&mut self, index: usize) -> BOMResult<BOMBlock> {
300        let idx = self.store_header.index_store.indexs.get(index);
301        if let Some(idx) = idx {
302            let offset = idx.offset as usize;
303            let len = idx.len as usize;
304            let end = offset.checked_add(len).ok_or(BOMEror::InvalidIndexRange {
305                index,
306                offset,
307                len,
308                source_len: self.source.len(),
309            })?;
310            if end > self.source.len() {
311                return Err(BOMEror::InvalidIndexRange {
312                    index,
313                    offset,
314                    len,
315                    source_len: self.source.len(),
316                });
317            }
318
319            return self.source.slice(offset..end).map(BOMBlock::new);
320        }
321
322        Err(BOMEror::NotFoundIndex(index))
323    }
324
325    fn tree_with_name(&mut self, name: &[u8]) -> BOMResult<Vec<TreePaths>> {
326        let mut block = self.block_with_name(name)?;
327        let (_, header) = TreeHeader::from_reader((&mut block, 0))?;
328
329        let mut tree_paths = vec![];
330        let mut tree_idx = header.index;
331        loop {
332            let path: TreePaths = self.read_block_at(tree_idx as usize)?;
333            if path.is_leaf == 0 {
334                if let Some(idx) = path.indices.first() {
335                    tree_idx = idx.val;
336                    continue;
337                }
338                break;
339            }
340
341            let next_idx = path.forward;
342            tree_paths.push(path);
343            if next_idx > 0 {
344                tree_idx = next_idx;
345            } else {
346                break;
347            }
348        }
349
350        Ok(tree_paths)
351    }
352
353    pub fn read_block_at<'a, T>(&mut self, index: usize) -> BOMResult<T>
354    where
355        T: deku::DekuReader<'a>,
356    {
357        let block = self.block_at(index)?;
358        let mut reader = Reader::new(block);
359        let data = T::from_reader_with_ctx(&mut reader, ())?;
360
361        Ok(data)
362    }
363
364    pub fn read_block_with_name<'a, T>(&mut self, name: &[u8]) -> BOMResult<T>
365    where
366        T: deku::DekuReader<'a>,
367    {
368        let var = self
369            .store_header
370            .var_with_name(name)
371            .ok_or(BOMEror::NotFoundVar(
372                String::from_utf8_lossy(name).to_string(),
373            ))?;
374        self.read_block_at(var.index as usize)
375    }
376
377    pub fn read_tree_to_btree_map<'a, K, V>(&mut self, name: &[u8]) -> BOMResult<BTreeMap<K, V>>
378    where
379        K: deku::DekuReader<'a> + Ord,
380        V: deku::DekuReader<'a>,
381    {
382        let mut map = BTreeMap::new();
383        self.parse_tree(name, |k, v| {
384            let k = K::from_reader_with_ctx(&mut Reader::new(k), ())?;
385            let v = V::from_reader_with_ctx(&mut Reader::new(v), ())?;
386            map.insert(k, v);
387
388            Ok(())
389        })?;
390
391        Ok(map)
392    }
393
394    pub fn read_tree_to_map<'a, K, V>(&mut self, name: &[u8]) -> BOMResult<HashMap<K, V>>
395    where
396        K: deku::DekuReader<'a> + Ord + Hash,
397        V: deku::DekuReader<'a>,
398    {
399        let mut map = HashMap::new();
400        self.parse_tree(name, |k, v| {
401            let k = K::from_reader_with_ctx(&mut Reader::new(k), ())?;
402            let v = V::from_reader_with_ctx(&mut Reader::new(v), ())?;
403            map.insert(k, v);
404
405            Ok(())
406        })?;
407
408        Ok(map)
409    }
410
411    pub fn parse_tree<F>(&mut self, name: &[u8], mut block: F) -> BOMResult<()>
412    where
413        F: FnMut(BOMBlock, BOMBlock) -> BOMResult<()>,
414    {
415        let paths = self.tree_with_name(name)?;
416        for path in paths {
417            for i in path.indices {
418                let k = self.block_at(i.key as usize)?;
419                let v = self.block_at(i.val as usize)?;
420                block(k, v)?;
421            }
422        }
423
424        Ok(())
425    }
426}
427
428impl BOM {
429    pub fn new_with_file<P>(file_path: P) -> BOMResult<Self>
430    where
431        P: AsRef<Path>,
432    {
433        let file = File::options().read(true).open(file_path)?;
434        let mmap = unsafe { Mmap::map(&file) }?;
435        Self::from_byte_source(ByteSource::from_mmap(mmap))
436    }
437}
438
439#[derive(Error, Debug)]
440pub enum BOMEror {
441    #[error("Read failed {0}")]
442    ReadIO(#[from] io::Error),
443    #[error("Parse struct failed {0}")]
444    ParseStruct(#[from] deku::DekuError),
445    #[error("Cann't not found for index {0}")]
446    NotFoundIndex(usize),
447    #[error("Invalid BOM index range {index}: offset {offset}, len {len}, source len {source_len}")]
448    InvalidIndexRange {
449        index: usize,
450        offset: usize,
451        len: usize,
452        source_len: usize,
453    },
454    #[error("Invalid byte range: offset {offset}, len {len}, source len {source_len}")]
455    InvalidByteRange {
456        offset: usize,
457        len: usize,
458        source_len: usize,
459    },
460    #[error("Cann't not found for name {0}")]
461    NotFoundVar(String),
462    #[error("Cann't not found for tree {0}")]
463    NotFoundTree(String),
464}
465
#[cfg(test)]
mod tests {
    use std::io::Read;

    use super::{BOM, BOMEror};

    /// Appends `value` to `bytes` in big-endian byte order.
    fn push_be_u32(bytes: &mut Vec<u8>, value: u32) {
        bytes.extend_from_slice(&value.to_be_bytes());
    }

    /// Builds a minimal synthetic store with a single index entry
    /// (`offset`, `len`) and no variables, zero-padded up to `offset` and
    /// followed by `payload`.
    fn bom_bytes_with_index(offset: u32, len: u32, payload: &[u8]) -> Vec<u8> {
        let index_offset = 32u32;
        let var_offset = 44u32;
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"BOMStore");
        push_be_u32(&mut bytes, 1); // version
        push_be_u32(&mut bytes, 1); // block_count
        push_be_u32(&mut bytes, index_offset);
        push_be_u32(&mut bytes, 12); // index_len: count + one entry
        push_be_u32(&mut bytes, var_offset);
        push_be_u32(&mut bytes, 4); // var_len: empty variable store count
        push_be_u32(&mut bytes, 1); // index count
        push_be_u32(&mut bytes, offset);
        push_be_u32(&mut bytes, len);
        push_be_u32(&mut bytes, 0); // variable count

        let offset = offset as usize;
        if bytes.len() < offset {
            bytes.resize(offset, 0);
        }
        bytes.extend_from_slice(payload);
        bytes
    }

    /// A valid index entry yields a block that views exactly the payload bytes.
    #[test]
    fn block_at_returns_range_view_bytes() {
        let bytes = bom_bytes_with_index(64, 5, b"hello");
        let mut bom = BOM::from_bytes(bytes).expect("synthetic BOM should parse");

        let mut block = bom.block_at(0).expect("block should exist");
        assert_eq!(block.as_slice(), b"hello");
        assert_eq!(block.byte_slice().absolute_range(), 64..69);

        let mut read = Vec::new();
        block.read_to_end(&mut read).expect("block should read");
        assert_eq!(read, b"hello");
    }

    /// An index entry that reaches past the end of the source is rejected.
    #[test]
    fn block_at_rejects_out_of_range_index() {
        let bytes = bom_bytes_with_index(100, 10, &[]);
        let mut bom = BOM::from_bytes(bytes).expect("synthetic BOM should parse");

        let err = bom
            .block_at(0)
            .expect_err("invalid index range should fail");
        assert!(matches!(
            err,
            BOMEror::InvalidIndexRange {
                index: 0,
                offset: 100,
                len: 10,
                ..
            }
        ));
    }
}