1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
use std::{cell::RefCell, collections::HashMap, convert::TryInto, ops::Range};

use crate::chunked_read_buffer_manager::{ChunkedReadBufferManager, RangeLocation, RangeSourcing};

use elsa::FrozenVec;

use crate::{FileAndPathHelperResult, FileContents};

/// Chunk granularity handed to `ChunkedReadBufferManager` as its const generic
/// parameter: 32 * 1024 (presumably bytes, i.e. 32 KiB — confirm against
/// `chunked_read_buffer_manager`).
const CHUNK_SIZE: u64 = 32 * 1024;

/// A source of file bytes that can be read at arbitrary offsets.
///
/// `FileContentsWithChunkedCaching` is generic over this trait and calls
/// `read_bytes_into` whenever it needs bytes that it has not read yet.
pub trait FileByteSource {
    /// Read `size` bytes at offset `offset` and append them to `buffer`.
    /// If successful, `buffer` must have had its len increased exactly by `size`,
    /// otherwise the caller may panic.
    fn read_bytes_into(
        &self,
        buffer: &mut Vec<u8>,
        offset: u64,
        size: usize,
    ) -> FileAndPathHelperResult<()>;
}

/// File contents backed by a `FileByteSource`, which keeps every buffer it has
/// read so that slices into those buffers can be handed out from `&self` methods.
pub struct FileContentsWithChunkedCaching<S: FileByteSource> {
    /// The underlying byte source that all reads are delegated to.
    source: S,
    /// The file length supplied to `new`; used for bounds checks and returned by `len()`.
    file_len: u64,
    /// Decides whether a requested range is already covered by an existing buffer
    /// or which range needs to be read into a new buffer.
    buffer_manager: RefCell<ChunkedReadBufferManager<CHUNK_SIZE>>,
    /// Locations of delimiter-terminated byte strings found by `read_bytes_at_until`,
    /// keyed by `(start offset, delimiter)`. The stored size excludes the delimiter.
    string_cache: RefCell<HashMap<(u64, u8), RangeLocation>>,
    /// All buffers read so far. A `RangeLocation::buffer_handle` is an index into
    /// this list; buffers are only ever appended.
    buffers: FrozenVec<Box<[u8]>>,
}

impl<S: FileByteSource> FileContentsWithChunkedCaching<S> {
    /// Creates a caching wrapper around `source`.
    ///
    /// `file_len` is stored as-is; it is used for bounds checks and to size the
    /// buffer manager. No bytes are read from `source` here.
    pub fn new(file_len: u64, source: S) -> Self {
        FileContentsWithChunkedCaching {
            source,
            buffers: FrozenVec::new(),
            file_len,
            buffer_manager: RefCell::new(ChunkedReadBufferManager::new_with_size(file_len)),
            string_cache: RefCell::new(HashMap::new()),
        }
    }

    /// Resolves `location` to the bytes it refers to inside one of the stored buffers.
    ///
    /// # Panics
    ///
    /// Panics if `location` does not lie within the buffer it names (the
    /// indexing operations are bounds-checked).
    #[inline]
    fn slice_from_location(&self, location: &RangeLocation) -> &[u8] {
        let buffer = &self.buffers[location.buffer_handle];
        &buffer[location.offset_from_start..][..location.size]
    }

    /// Returns the location of `range` inside a stored buffer, reading a new
    /// buffer from the source first if the buffer manager says one is needed.
    ///
    /// Must be called with a valid, non-empty range which does not exceed file_len.
    ///
    /// # Errors
    ///
    /// Returns an error if a length or offset does not fit into `usize`, or if
    /// the source fails to read the requested bytes.
    ///
    /// # Panics
    ///
    /// Panics if the buffer manager hands back an inverted read range, or if the
    /// source reports success without appending exactly the requested number of
    /// bytes.
    #[inline]
    fn get_range_location(&self, range: Range<u64>) -> FileAndPathHelperResult<RangeLocation> {
        let mut buffer_manager = self.buffer_manager.borrow_mut();
        let read_range = match buffer_manager.determine_range_sourcing(range.clone()) {
            RangeSourcing::InExistingBuffer(l) => return Ok(l),
            RangeSourcing::NeedToReadNewBuffer(read_range) => read_range,
        };
        assert!(read_range.start <= read_range.end);

        // Do all u64 -> usize narrowing up front with checked conversions, so that
        // nothing is read or registered if a value cannot be represented (relevant
        // on targets where usize is narrower than u64).
        let read_len: usize = (read_range.end - read_range.start).try_into()?;
        let offset_from_start: usize = (range.start - read_range.start).try_into()?;
        let size: usize = (range.end - range.start).try_into()?;

        // Read the bytes from the source. The final length is known, so allocate
        // it once; this also lets `into_boxed_slice` skip a shrinking reallocation.
        let mut buffer = Vec::with_capacity(read_len);
        self.source
            .read_bytes_into(&mut buffer, read_range.start, read_len)?;
        assert!(buffer.len() == read_len);

        // The handle is the index the new buffer will have once pushed.
        let buffer_handle = self.buffers.len();
        self.buffers.push(buffer.into_boxed_slice());
        buffer_manager.insert_buffer_range(read_range, buffer_handle);

        Ok(RangeLocation {
            buffer_handle,
            offset_from_start,
            size,
        })
    }
}

impl<S: FileByteSource> FileContents for FileContentsWithChunkedCaching<S> {
    /// Returns the file length that was supplied at construction time.
    #[inline]
    fn len(&self) -> u64 {
        self.file_len
    }

    /// Returns the `size` bytes starting at `offset`.
    ///
    /// A `size` of zero yields an empty slice without any bounds check.
    ///
    /// # Errors
    ///
    /// Returns an error if `offset + size` overflows `u64`, if the range extends
    /// past the file length, or if the bytes cannot be obtained from the source.
    #[inline]
    fn read_bytes_at(&self, offset: u64, size: u64) -> FileAndPathHelperResult<&[u8]> {
        if size == 0 {
            return Ok(&[]);
        }

        let start = offset;
        let end = offset.checked_add(size).ok_or_else(|| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "read_bytes_at with offset + size overflowing u64",
            )
        })?;
        if end > self.file_len {
            return Err(Box::new(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "read_bytes_at range out-of-bounds",
            )));
        }
        let location = self.get_range_location(start..end)?;
        Ok(self.slice_from_location(&location))
    }

    /// Returns the bytes from `range.start` up to, but not including, the first
    /// occurrence of `delimiter` within `range`.
    ///
    /// At most `MAX_LENGTH_INCLUDING_DELIMITER` bytes from `range.start` are
    /// searched. Successful lookups are cached by `(range.start, delimiter)`.
    ///
    /// # Errors
    ///
    /// Returns an error if `range` is inverted or extends past the file length,
    /// if the bytes cannot be obtained from the source, or if no delimiter is
    /// found within the searched part of `range` (which is always the case for
    /// an empty range).
    #[inline]
    fn read_bytes_at_until(
        &self,
        range: Range<u64>,
        delimiter: u8,
    ) -> FileAndPathHelperResult<&[u8]> {
        const MAX_LENGTH_INCLUDING_DELIMITER: u64 = 4096;

        // Shared constructor for the "no delimiter" error used on several paths.
        fn delimiter_not_found() -> std::io::Error {
            std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "Could not find delimiter",
            )
        }

        if range.end < range.start {
            return Err(Box::new(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "read_bytes_at_until called with range.end < range.start",
            )));
        }
        if range.end > self.file_len {
            return Err(Box::new(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "read_bytes_at_until range out-of-bounds",
            )));
        }

        // An empty range cannot contain the delimiter. Bail out here instead of
        // handing an empty range to `get_range_location`, which requires a
        // non-empty one.
        let range_len = range.end - range.start;
        if range_len == 0 {
            return Err(Box::new(delimiter_not_found()));
        }

        let mut string_cache = self.string_cache.borrow_mut();
        if let Some(location) = string_cache.get(&(range.start, delimiter)) {
            // The cache key does not include `range.end`. The cached delimiter sits
            // at `range.start + location.size`, so the entry only answers this call
            // if that position is inside the requested range. Otherwise the first
            // delimiter is outside `range`, exactly as the uncached path would report.
            if (location.size as u64) < range_len {
                return Ok(self.slice_from_location(location));
            }
            return Err(Box::new(delimiter_not_found()));
        }

        // `max_len <= range_len`, so `range.start + max_len` cannot exceed `range.end`.
        let max_len = range_len.min(MAX_LENGTH_INCLUDING_DELIMITER);
        let mut location = self.get_range_location(range.start..(range.start + max_len))?;
        let bytes = self.slice_from_location(&location);

        let string_len = match memchr::memchr(delimiter, bytes) {
            Some(len) => len,
            None => return Err(Box::new(delimiter_not_found())),
        };

        // Remember only the string itself (without the delimiter) for later calls.
        location.size = string_len;
        string_cache.insert((range.start, delimiter), location);
        Ok(&bytes[..string_len])
    }

    /// Reads `size` bytes at `offset` straight from the underlying source and
    /// appends them to `buffer`, bypassing the stored buffers.
    fn read_bytes_into(
        &self,
        buffer: &mut Vec<u8>,
        offset: u64,
        size: usize,
    ) -> FileAndPathHelperResult<()> {
        self.source.read_bytes_into(buffer, offset, size)
    }
}