// filebuffer/lib.rs
// Filebuffer -- Fast and simple file reading
// Copyright 2016 Ruud van Asseldonk
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// A copy of the License has been included in the root of the repository.

//! Filebuffer, a library for fast and simple file reading.
//!
//! # Examples
//!
//! Map a file into memory and access it as a slice of bytes. This is simple and will generally
//! outperform `Read::read_to_end()`.
//!
//! ```
//! use filebuffer::FileBuffer;
//! let fbuffer = FileBuffer::open("src/lib.rs").unwrap();
//! assert_eq!(&fbuffer[3..45], &b"Filebuffer -- Fast and simple file reading"[..]);
//! ```

#![warn(missing_docs)]

use std::cmp;
use std::fs;
use std::io;
use std::ops::Deref;
use std::path::Path;
use std::ptr;
use std::slice;

31#[cfg(unix)]
32mod unix;
33
34#[cfg(windows)]
35mod windows;
36
37#[cfg(unix)]
38use unix::{PlatformData, get_page_size, map_file, unmap_file, prefetch};
39
40#[cfg(all(unix))]
41use unix::get_resident;
42
43#[cfg(windows)]
44use windows::{PlatformData, get_resident, get_page_size, map_file, unmap_file, prefetch};
45
/// A memory-mapped file.
///
/// # Safety
///
/// **On Unix-ish platforms, external modifications to the file made after the file buffer was
/// opened can show up in this file buffer.** In particular, if a file is truncated after opening,
/// accessing the removed part causes undefined behavior. On Windows it is possible to prevent this
/// by opening the file in exclusive mode, but that functionality is not available in stable Rust
/// currently. (Filebuffer will be updated after stabilization.)
///
/// It is recommended to ensure that other applications do not write to the file when it is mapped,
/// possibly by marking the file read-only. (Though even this is no guarantee.)
#[derive(Debug)]
pub struct FileBuffer {
    // System page size in bytes, cached at `open` time; used for the page-alignment
    // arithmetic in `resident_len` and `prefetch`.
    page_size: usize,
    // Start of the read-only mapping. Null when the mapped file is empty (see the
    // null checks in `resident_len`, `prefetch`, `leak`, and `deref`), and after `leak`.
    buffer: *const u8,
    // Length of the mapping in bytes; the logical file length, not rounded to pages.
    length: usize,

    #[allow(dead_code)] // This field is not dead, it might have an effectful destructor.
    platform_data: PlatformData,
}
67
/// Rounds `size` up to the nearest multiple of `power_of_two`.
///
/// `power_of_two` must be a power of two (the bit-mask trick below relies on it), and
/// `size + power_of_two - 1` must not overflow `usize`. Both hold for the page-size
/// arithmetic this helper is used for; a `debug_assert!` documents and checks the
/// precondition in debug builds without costing anything in release builds.
fn round_up_to(size: usize, power_of_two: usize) -> usize {
    debug_assert!(power_of_two.is_power_of_two());
    (size + (power_of_two - 1)) & !(power_of_two - 1)
}
72
#[test]
fn verify_round_up_to() {
    // A value below, exactly at, and just above a multiple of the alignment.
    assert_eq!(round_up_to(23, 1024), 1024);
    assert_eq!(round_up_to(1024, 1024), 1024);
    assert_eq!(round_up_to(1025, 1024), 2048);
}
79
/// Rounds `size` down to the nearest multiple of `power_of_two`.
///
/// `power_of_two` must be a power of two; the mask `!(power_of_two - 1)` clears the
/// low bits, which only yields the nearest lower multiple under that precondition.
/// Checked with a `debug_assert!` (free in release builds).
fn round_down_to(size: usize, power_of_two: usize) -> usize {
    debug_assert!(power_of_two.is_power_of_two());
    size & !(power_of_two - 1)
}
84
#[test]
fn verify_round_down_to() {
    // A value below, exactly at, and just above a multiple of the alignment.
    assert_eq!(round_down_to(23, 1024), 0);
    assert_eq!(round_down_to(1024, 1024), 1024);
    assert_eq!(round_down_to(1025, 1024), 1024);
}
91
92impl FileBuffer {
93 /// Maps the file at `path` into memory.
94 pub fn open<P: AsRef<Path>>(path: P) -> io::Result<FileBuffer> {
95 // Open the `fs::File` so we get all of std's error handling for free, then use it to
96 // extract the file descriptor. The file is closed again when `map_file` returns on
97 // Unix-ish platforms, but `mmap` only requires the descriptor to be open for the `mmap`
98 // call, so this is fine. On Windows, the file must be kept open for the lifetime of the
99 // mapping, so `map_file` moves the file into the platform data.
100 let mut open_opts = fs::OpenOptions::new();
101 open_opts.read(true);
102
103 // TODO: On Windows, set `share_mode()` to read-only. This requires the
104 // `open_options_ext` feature that is currently unstable, but it is
105 // required to ensure that a different process does not suddenly modify
106 // the contents of the file. See also Rust issue 27720.
107
108 let file = open_opts.open(path)?;
109 let (buffer, length, platform_data) = map_file(file)?;
110 let fbuffer = FileBuffer {
111 page_size: get_page_size(),
112 buffer: buffer,
113 length: length,
114 platform_data: platform_data
115 };
116 Ok(fbuffer)
117 }
118
119 /// Returns the number of bytes resident in physical memory, starting from `offset`.
120 ///
121 /// The slice `[offset..offset + resident_len]` can be accessed without causing page faults or
122 /// disk access. Note that this is only a snapshot, and the kernel might decide to evict pages
123 /// or make them resident at any time.
124 ///
125 /// The returned resident length is at most `length`.
126 ///
127 /// # Panics
128 ///
129 /// Panics if the specified range lies outside of the buffer.
130 ///
131 /// # Remarks
132 ///
133 /// Windows does not expose a mechanism to query which pages are resident in physical
134 /// memory. Therefore this function optimistically claims that the entire range is resident
135 /// on Windows.
136 pub fn resident_len(&self, offset: usize, length: usize) -> usize {
137 // The specified offset and length must lie within the buffer.
138 assert!(offset + length <= self.length);
139
140 // This is a no-op for empty files.
141 if self.buffer == ptr::null() { return 0; }
142
143 let aligned_offset = round_down_to(offset, self.page_size);
144 let aligned_length = round_up_to(length + (offset - aligned_offset), self.page_size);
145 let num_pages = aligned_length / self.page_size;
146
147 // There is a tradeoff here: to store residency information, we need an array of booleans.
148 // The requested range can potentially be very large and it is only known at runtime. We
149 // could allocate a vector here, but that requires a heap allocation just to get residency
150 // information (which might in turn cause a page fault). Instead, check at most 32 pages at
151 // once. This means more syscalls for large ranges, but it saves us the heap allocation,
152 // and for ranges up to 32 pages (128 KiB typically) there is only one syscall.
153 let mut residency = [false; 32];
154 let mut pages_checked = 0;
155 let mut pages_resident = 0;
156
157 while pages_checked < num_pages {
158 let pages_to_check = cmp::min(32, num_pages - pages_checked);
159 let check_offset = (aligned_offset + pages_checked * self.page_size) as isize;
160 let check_buffer = unsafe { self.buffer.offset(check_offset) };
161 let check_length = pages_to_check * self.page_size;
162 get_resident(check_buffer, check_length, &mut residency);
163
164 // Count the number of resident pages.
165 match residency[..pages_to_check].iter().position(|resident| !resident) {
166 Some(non_resident) => {
167 // The index of the non-resident page is the number of resident pages.
168 pages_resident += non_resident;
169 break;
170 }
171 None => {
172 pages_resident += pages_to_check;
173 pages_checked += pages_to_check;
174 }
175 }
176 }
177
178 let resident_length = pages_resident * self.page_size + aligned_offset - offset;
179
180 // Never return more than the requested length. The resident length might be larger than
181 // the length of the buffer, because it is rounded up to the page size.
182 cmp::min(length, resident_length)
183 }
184
185 /// Returns the system page size.
186 ///
187 /// When the kernel makes the file resident in physical memory, it does so with page
188 /// granularity. (In practice this happens in larger chunks, but still in multiples of
189 /// the page size.) Therefore, when processing the file in chunks, this is a good chunk
190 /// length.
191 pub fn chunk_len_hint(&self) -> usize {
192 self.page_size
193 }
194
195 /// Advises the kernel to make a slice of the file resident in physical memory.
196 ///
197 /// This method does not block, meaning that when the function returns, the slice is not
198 /// necessarily resident. After this function returns, the kernel may read the requested slice
199 /// from disk and make it resident. Note that this is only an advice, the kernel need not honor
200 /// it.
201 ///
202 /// To check whether the slice is resident at a later time, use `resident_len()`.
203 ///
204 /// # Panics
205 ///
206 /// Panics if the specified range lies outside of the buffer.
207 pub fn prefetch(&self, offset: usize, length: usize) {
208 // TODO: This function should use `collections::range::RangeArgument` once stabilized.
209 // The specified offset and length must lie within the buffer.
210 assert!(offset + length <= self.length);
211
212 // This is a no-op for empty files.
213 if self.buffer == ptr::null() { return; }
214
215 let aligned_offset = round_down_to(offset, self.page_size);
216 let aligned_length = round_up_to(length + (offset - aligned_offset), self.page_size);
217
218 let buffer = unsafe { self.buffer.offset(aligned_offset as isize) };
219 prefetch(buffer, aligned_length);
220 }
221
222 /// Leaks the file buffer as a byte slice.
223 ///
224 /// This prevents the buffer from being unmapped, keeping the file mapped until the program
225 /// ends. This is not as bad as it sounds, because the kernel is free to evict pages from
226 /// physical memory in case of memory pressure. Because the file is mapped read-only, it can
227 /// always be read from disk again.
228 ///
229 /// If the file buffer is going to be open for the entire duration of the program anyway, this
230 /// method can avoid some lifetime issues. Still, it is good practice to close the file buffer
231 /// if possible. This method should be a last resort.
232 pub fn leak(mut self) -> &'static [u8] {
233 let buffer = if self.buffer == ptr::null() {
234 &[]
235 } else {
236 unsafe { slice::from_raw_parts(self.buffer, self.length) }
237 };
238
239 // Prevent `drop()` from freeing the buffer.
240 self.buffer = ptr::null();
241 self.length = 0;
242
243 buffer
244 }
245}
246
// SAFETY: there is no possibility of data races when passing `&FileBuffer` across threads,
// because the buffer is read-only. `&FileBuffer` has no interior mutability: every method
// takes `&self` without mutating, except `leak`, which consumes the buffer by value.
unsafe impl Sync for FileBuffer {}

// SAFETY: it is safe to move a `FileBuffer` into a different thread. The struct is only
// non-Send automatically because it holds a raw pointer; nothing in it is tied to the
// thread that created the mapping.
unsafe impl Send for FileBuffer {}
253
254impl Drop for FileBuffer {
255 fn drop(&mut self) {
256 if self.buffer != ptr::null() { unmap_file(self.buffer, self.length); }
257 }
258}
259
260impl Deref for FileBuffer {
261 type Target = [u8];
262
263 fn deref(&self) -> &[u8] {
264 if self.buffer == ptr::null() {
265 &[]
266 } else {
267 unsafe { slice::from_raw_parts(self.buffer, self.length) }
268 }
269 }
270}
271
272impl AsRef<[u8]> for FileBuffer {
273 fn as_ref(&self) -> &[u8] {
274 self.deref()
275 }
276}
277
#[test]
fn open_file() {
    // Mapping this very source file should succeed.
    assert!(FileBuffer::open("src/lib.rs").is_ok());
}
283
#[test]
fn make_resident() {
    let fb = FileBuffer::open("src/lib.rs").unwrap();

    // Reading from the first page faults it in, making it resident.
    assert_eq!(&fb[3..13], &b"Filebuffer"[..]);

    // The bytes that were just read should therefore be reported as resident.
    assert_eq!(10, fb.resident_len(3, 10));
}
294
#[test]
fn prefetch_is_not_harmful() {
    let fb = FileBuffer::open("src/lib.rs").unwrap();
    let total_len = fb.len();

    // Without root access we cannot instruct the kernel to drop its caches, so there is no
    // way to observe that prefetching actually works; we can only verify it does no harm.
    fb.prefetch(0, total_len);

    // The buffer must still be readable as normal afterwards.
    assert_eq!(&fb[3..13], &b"Filebuffer"[..]);
}
306
#[test]
fn drop_after_leak() {
    let mut bytes = &[0u8][..];
    assert_eq!(bytes[0], 0);

    // The `FileBuffer` goes out of scope here, but the leaked slice must stay valid.
    {
        let fb = FileBuffer::open("src/lib.rs").unwrap();
        bytes = fb.leak();
    }

    assert_eq!(&bytes[3..13], &b"Filebuffer"[..]);
}
317
#[test]
fn fbuffer_can_be_moved_into_thread() {
    use std::thread;

    let fbuffer = FileBuffer::open("src/lib.rs").unwrap();
    let handle = thread::spawn(move || {
        assert_eq!(&fbuffer[3..13], &b"Filebuffer"[..]);
    });

    // Join the thread so that a panic (a failed assert) inside it propagates as an `Err`
    // and fails this test. Previously the `JoinHandle` was dropped, detaching the thread,
    // so a failing assertion in it could go unnoticed.
    handle.join().unwrap();
}
327
#[test]
fn fbuffer_can_be_shared_among_threads() {
    use std::sync;
    use std::thread;

    let fbuffer = FileBuffer::open("src/lib.rs").unwrap();
    let buffer1 = sync::Arc::new(fbuffer);
    let buffer2 = buffer1.clone();
    let handle = thread::spawn(move || {
        assert_eq!(&buffer2[3..13], &b"Filebuffer"[..]);
    });
    assert_eq!(&buffer1[17..45], &b"Fast and simple file reading"[..]);

    // Join the thread so a failed assert in it fails this test instead of being lost
    // with a detached handle, as happened before.
    handle.join().unwrap();
}
341
#[test]
fn open_empty_file_is_fine() {
    // A zero-length file has nothing to map, but opening it must still succeed.
    assert!(FileBuffer::open("src/empty_file_for_testing.rs").is_ok());
}
346
#[test]
fn empty_file_prefetch_is_fine() {
    // Prefetching a zero-length range of an empty (unmapped) file must be a no-op.
    let fb = FileBuffer::open("src/empty_file_for_testing.rs").unwrap();
    fb.prefetch(0, 0);
}
352
#[test]
fn empty_file_deref_is_fine() {
    let fb = FileBuffer::open("src/empty_file_for_testing.rs").unwrap();
    // The slice view of an empty file must yield no bytes at all.
    assert!(fb.iter().next().is_none());
}
358
#[test]
fn empty_file_has_zero_resident_len() {
    // Nothing of an empty file can be resident, because nothing was mapped.
    let fb = FileBuffer::open("src/empty_file_for_testing.rs").unwrap();
    assert_eq!(0, fb.resident_len(0, 0));
}
364
#[test]
fn page_size_at_least_4096() {
    // There is no reason why the page size cannot be smaller, it is just that in practice
    // there is no platform with a smaller page size, so this tests that `get_page_size()`
    // returns a plausible value.
    let page_size = get_page_size();
    assert!(page_size >= 4096);
}
371}