fileslice/
lib.rs

1/*! Slices of files
2
3[`FileSlice`] is to `File` what [`Bytes`](https://docs.rs/bytes/) is to
4`Vec<u8>`.  Advantages over `File`:
5
6* You can slice it, reducing the scope to a range within the original file
7* Cloning is cheap (atomic addition; no syscall)
8* Seeking is very cheap (normal addition; no syscall)
9* Clones can't affect each other at all (the fd's real cursor is never
10  used).
11
12Once created, a `FileSlice` never changes length, even if the underlying file
13does.  For example, if another process appends some data to the file, you need
14to call [`FileSlice::expand`] on your slice in order to add the new data.
15
16## Optional features
17
18Optional integrations for crates which naturally benefit from file slicing:
19
20* `tar`: Adds a [`slice_tarball`] helper method for splitting up a
21  `tar::Archive` into a bunch of `FileSlice`s.
22* `parquet`: Adds a [`ChunkReader`][parquet::file::reader::ChunkReader]
23  impl for [`FileSlice`].  A parquet file contains many pages, and the decoder
24  needs to interleave reads from these pages.  The `ChunkReader` impl for `File`
25  accomplishes this by making many clones of the fd.  Using `FileSlice` instead
26  lets you open roughly 7x as many parquet files before you hit your fd limit.
27
28*/
29
30use std::fs::File;
31use std::io::{Read, Seek, SeekFrom};
32use std::ops::{Bound, RangeBounds};
33use std::sync::Arc;
34
/// A cheaply cloneable window onto a byte range of a [`File`]
///
/// Acts like an ordinary read-only file restricted to `start..end`.  The read
/// position is tracked in userspace and data is fetched with positioned reads
/// (the `pread` family), so the descriptor's own cursor is never used.
#[derive(Debug, Clone)]
pub struct FileSlice {
    /// Shared handle to the underlying file; clones bump the `Arc` refcount
    file: Arc<File>,
    /// Absolute offset of the next byte to read.  May run past `end`, but is
    /// never smaller than `start`.
    cursor: u64,
    /// Absolute offset (in the underlying file) where the slice begins
    start: u64,
    /// Absolute offset (in the underlying file) one past the slice's last byte
    end: u64,
}
47
48impl FileSlice {
49    /// Create a new slice covering the whole file
50    pub fn new(file: File) -> FileSlice {
51        let end = file.metadata().unwrap().len();
52        FileSlice {
53            file: Arc::new(file),
54            cursor: 0,
55            start: 0,
56            end,
57        }
58    }
59
60    /// Take a sub-slice of this file
61    pub fn slice<T>(&self, range: T) -> FileSlice
62    where
63        T: RangeBounds<u64>,
64    {
65        // The parameters are interpreted relative to `self`
66        let start = match range.start_bound() {
67            Bound::Included(x) => self.start + x,
68            Bound::Excluded(x) => self.start + x + 1,
69            Bound::Unbounded => self.start,
70        };
71        let end = match range.end_bound() {
72            Bound::Included(x) => self.start + x + 1,
73            Bound::Excluded(x) => self.start + x,
74            Bound::Unbounded => self.end,
75        };
76        let end = end
77            .min(self.end) // Not allowed to expand beyond `self`
78            .max(start); // We require that `start <= end`
79        FileSlice {
80            file: self.file.clone(),
81            cursor: start,
82            start,
83            end,
84        }
85    }
86}
87
88impl FileSlice {
89    /// The position at which this slice begins, as a byte offset into the
90    /// underlying file
91    pub fn start_pos(&self) -> u64 {
92        self.start
93    }
94
95    /// The position at which this slice ends, as a byte offset into the
96    /// underlying file
97    pub fn end_pos(&self) -> u64 {
98        self.end
99    }
100
101    /// The next byte to be read, as an offset into the underlying file
102    pub fn cursor_pos(&self) -> u64 {
103        self.cursor
104    }
105
106    pub fn is_empty(&self) -> bool {
107        self.start == self.end
108    }
109
110    pub fn len(&self) -> usize {
111        (self.end - self.start) as usize
112    }
113
114    pub fn bytes_remaining(&self) -> usize {
115        (self.end - self.cursor) as usize
116    }
117}
118
119impl Read for FileSlice {
120    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
121        let remaining = (self.end - self.cursor) as usize;
122        let buf = if buf.len() > remaining {
123            &mut buf[..remaining]
124        } else {
125            buf
126        };
127
128        let x;
129        #[cfg(target_family = "unix")]
130        {
131            use std::os::unix::fs::FileExt;
132            x = self.file.read_at(buf, self.cursor)?;
133        }
134        #[cfg(target_family = "windows")]
135        {
136            use std::os::windows::fs::FileExt;
137            x = self.file.seek_read(buf, self.cursor)?;
138        }
139        #[cfg(target_family = "wasm")]
140        {
141            use std::os::wasi::fs::FileExt;
142            x = self.file.read_at(buf, self.cursor)?;
143        }
144
145        self.cursor += x as u64;
146        Ok(x)
147    }
148}
149
150impl Seek for FileSlice {
151    fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
152        let cursor = match pos {
153            SeekFrom::Current(x) => i128::from(self.cursor) + i128::from(x),
154            SeekFrom::Start(x) => i128::from(self.start + x),
155            SeekFrom::End(x) => i128::from(self.end) + i128::from(x),
156        };
157        let cursor = match u64::try_from(cursor) {
158            Ok(x) if x >= self.start => x,
159            _ => {
160                return Err(std::io::Error::new(
161                    std::io::ErrorKind::Other,
162                    "Out of bounds",
163                ))
164            }
165        };
166        self.cursor = cursor;
167        self.stream_position()
168    }
169
170    fn stream_position(&mut self) -> std::io::Result<u64> {
171        Ok(self.cursor - self.start)
172    }
173}
174
175impl FileSlice {
176    /// Expand the slice to cover the whole file
177    ///
178    /// This queries the underlying file for its current length, which may have
179    /// changed since this `FileSlice` was created.  Counter-intuitively, this
180    /// means that calling this method _could_ in theory cause the length of the
181    /// `FileSlice` to reduce (if the underlying file has been truncated).
182    pub fn expand(&mut self) {
183        self.start = 0;
184        self.end = self.file.metadata().unwrap().len();
185    }
186
187    /// Try to get back the inner `File`
188    ///
189    /// This only works if this `FileSlice` has no living clones.  If there are
190    /// other `FileSlices` using the same `File`, this method will return the
191    /// original `FileSlice` unmodified.
192    pub fn try_unwrap(self) -> Result<File, FileSlice> {
193        Arc::try_unwrap(self.file).map_err(|file| FileSlice {
194            file,
195            cursor: self.cursor,
196            start: self.start,
197            end: self.end,
198        })
199    }
200}
201
#[cfg(feature = "parquet")]
mod parquet_impls {
    use super::*;
    use bytes::Bytes;
    use parquet::file::reader::{ChunkReader, Length};

    impl Length for FileSlice {
        /// The length of the slice in bytes
        fn len(&self) -> u64 {
            self.end - self.start
        }
    }

    impl ChunkReader for FileSlice {
        type T = FileSlice;

        /// A reader over everything from `start` (relative to this slice) to
        /// the end of this slice
        fn get_read(&self, start: u64) -> parquet::errors::Result<FileSlice> {
            // `slice` takes bounds relative to `self`, so leave the end
            // unbounded rather than passing the absolute `self.end`.
            Ok(self.slice(start..))
        }

        /// Read exactly `length` bytes beginning at `start` (relative to this
        /// slice)
        ///
        /// Fails if fewer than `length` bytes are available in that range.
        fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
            let mut buf = vec![0; length];
            // Saturate so that a bogus offset/length pair turns into a short
            // read error from `read_exact` instead of an arithmetic overflow.
            let end = start.saturating_add(length as u64);
            self.slice(start..end).read_exact(&mut buf)?;
            Ok(buf.into())
        }
    }
}
229
/// Split a tar archive into one [`FileSlice`] per entry
///
/// The archive is scanned once up front, recording each entry's header and
/// the byte range of its data within the file.  The returned iterator then
/// yields `(header, slice)` pairs in archive order; every slice shares the
/// archive's single file handle.
///
/// # Errors
///
/// Returns an error if the entry iterator cannot be created or if any entry
/// fails to parse while scanning the archive.
///
/// # Panics
///
/// Panics if the metadata of the archive's file cannot be queried.
#[cfg(feature = "tar")]
pub fn slice_tarball(
    mut archive: tar::Archive<File>,
) -> std::io::Result<impl Iterator<Item = (tar::Header, FileSlice)>> {
    let headers = archive
        .entries_with_seek()?
        .map(|entry| -> std::io::Result<_> {
            // Propagate malformed/unreadable entries to the caller instead of
            // panicking: this function already returns `io::Result`.
            let entry = entry?;
            let start = entry.raw_file_position();
            let end = start.saturating_add(entry.size());
            Ok((entry.header().clone(), start, end))
        })
        .collect::<std::io::Result<Vec<_>>>()?;
    let file = FileSlice::new(archive.into_inner());
    Ok(headers
        .into_iter()
        .map(move |(header, start, end)| (header, file.slice(start..end))))
}