buf_ref_reader/
lib.rs

1/*!
2Faster, growable buffering reader for when there's little to no need to modify data, nor to keep it alive past next read.
3
4`std::io::BufReader` works by copying data from its internal buffer into user-provided `Vec`/`String`,
5or, in case of `.lines()`, by emitting new heap-allocated `String` for each iteration.
6While convenient and versatile, this is not the fastest approach.
7
8Instead, `BufRefReader` references its internal buffer with each read, returning `&[u8]`.
9Lack of extra allocations yields better read performance in situations where most (if not all) of read data:
10
11- requires no modifications,
12- is never used outside of a loop body and does not need to be duplicated into the heap for future use.
13
14While being more performant, this approach also severely limits applicability of this reader:
15
16- it does not (and cannot) implement `BufRead` and cannot be used as a direct replacement for `BufReader`;
17- returned values are only valid between calls to reading functions (i.e. they cannot outlive even a single loop cycle), and Rust's borrow checker will prevent you from using stale references;
18- consequently, `BufRefReader` cannot be turned into an `Iterator` (here's an easy way to think about it: what would `Iterator::collect()` return?);
19- returned references are immutable;
20- obviously, there's also nothing that can return `String`s or `&str`s for you.
21
## Choice of a buffer
23
24Use [`MmapBuffer`](struct.MmapBuffer.html) unless:
25
26- [slice-deque](https://github.com/gnzlbg/slice_deque) is not available for your platform (e.g. no support for `mmap`),
27- you need very small buffers (smaller than 1 memory page),
28- you're about to create a lot of buffers in a short period of time ([`new()`](trait.Buffer.html#tymethod.new) is relatively expensive),
29- you're expecting buffer to grow a lot (consider, if possible, preallocating larger buffers through [`BufRefReaderBuilder.capacity`](struct.BufRefReaderBuilder.html#method.capacity)),
30- you have some very special concerns re: memory maps and malloc bypass (special allocators, possible kernel inefficiency due to large amount of mapped memory regions etc.).
31
32## Examples
33
34Read data word by word:
35
36```
37use buf_ref_reader::*;
38
39fn read<B: Buffer>() -> Result<(), Error>
40where
41	Error: From<B::Error>,
42	// add this if you plan to `unwrap()` errors returned by `read()` et al.
43	//B::Error: std::fmt::Debug,
44{
45	// &[u8] implements Read, hence we use it as our data source for this example
46	let data = b"lorem ipsum dolor sit amet";
47	let mut r = BufRefReaderBuilder::new(&data[..])
48		.capacity(4)
49		.build::<B>()?;
50
51	assert_eq!(r.read_until(b' ')?, Some(&b"lorem "[..]));
52	assert_eq!(r.read_until(b' ')?, Some(&b"ipsum "[..]));
53	assert_eq!(r.read_until(b' ')?, Some(&b"dolor "[..]));
54	assert_eq!(r.read_until(b' ')?, Some(&b"sit "[..]));
55	assert_eq!(r.read_until(b' ')?, Some(&b"amet"[..]));
56	assert_eq!(r.read_until(b' ')?, None); // EOF
57	assert_eq!(r.read_until(b' ')?, None);
58
59	Ok(())
60}
61
62fn main() {
63	read::<VecBuffer>().unwrap();
64	read::<MmapBuffer>().unwrap();
65}
66```
67*/
68
69#![warn(missing_docs)]
70
71use quick_error::quick_error;
72
73use std::io::{self, Read};
74use memchr::memchr;
75
76mod buffer;
77pub use buffer::{
78	Buffer,
79	VecBuffer,
80	MmapBuffer,
81};
82use slice_deque::AllocError;
83
84use std::convert::From;
85
/**
Buffering reader.

Wraps a source `R` and hands out `&[u8]` slices that point into the internal buffer `B`
instead of copying data out of it.

See [module-level docs](index.html) for examples.
*/
pub struct BufRefReader<R, B> {
	/// Underlying source; read from whenever the buffer needs more data.
	src: R,
	/// Buffer that holds data read from `src` and backs the returned slices.
	buf: B,
}
95
/**
Builder for [`BufRefReader`](struct.BufRefReader.html).

Holds the source reader together with the initial buffer capacity until
`build()` is called.

See [module-level docs](index.html) for examples.
*/
pub struct BufRefReaderBuilder<R> {
	/// Reader that is handed over to the built `BufRefReader`.
	src: R,
	/// Initial buffer capacity, passed to `Buffer::new()` by `build()`.
	bufsize: usize,
}
105impl<R: Read> BufRefReaderBuilder<R> {
106	/// Creates new builder with given reader and default options.
107	pub fn new(src: R) -> Self {
108		BufRefReaderBuilder {
109			src,
110			bufsize: 8192,
111		}
112	}
113
114	/// Set initial buffer capacity.
115	pub fn capacity(mut self, bufsize: usize) -> Self {
116		self.bufsize = bufsize;
117		self
118	}
119
120	/// Create actual reader.
121	pub fn build<B: Buffer>(self) -> Result<BufRefReader<R, B>, B::Error> {
122		Ok(BufRefReader {
123			src: self.src,
124			buf: B::new(self.bufsize)?,
125		})
126	}
127}
128
// Crate-wide error type, generated by `quick_error!`.
// Each variant wraps one underlying error value; the `from()` clause asks the macro
// to also generate the matching `From` impl, which is what allows `?` to turn an
// `io::Error` (or an `AllocError`) into `Error` inside the reading functions.
quick_error! {
	/// Error type that reading functions might emit
	#[derive(Debug)]
	pub enum Error {
		/// Error reading from actual reader
		IO(err: io::Error) { from() }
		/// Indicates failure to create/grow buffer
		Buf(err: AllocError) { from() }
	}
}
139impl From<()> for Error {
140	// VecBuffer never emits errors, it only panics
141	fn from(_: ()) -> Self {
142		unimplemented!()
143	}
144}
145
146impl<R: Read, B: Buffer> BufRefReader<R, B>
147where Error: From<B::Error>
148{
149	/// Creates buffered reader with default options. Look for [`BufRefReaderBuilder`](struct.BufRefReaderBuilder.html) for tweaks.
150	pub fn new(src: R) -> Result<BufRefReader<R, B>, B::Error> {
151		BufRefReaderBuilder::new(src)
152			.build()
153	}
154
155	// returns Some(where appended data starts within the filled part of the buffer),
156	// or None for EOF
157	#[inline]
158	fn fill(&mut self) -> Result<Option<usize>, Error> {
159		self.buf.enlarge()?;
160
161		let old_len = self.buf.len();
162
163		match self.src.read(self.buf.appendable())? {
164			0 => Ok(None), // EOF
165			n => {
166				self.buf.grow(n);
167				Ok(Some(old_len))
168			}
169		}
170	}
171
172	/**
173	Returns requested amount of bytes, or less if EOF prevents reader from fulfilling the request.
174
175	Returns:
176
177	- `Ok(Some(data))` with, well, data,
178	- `Ok(None)` if no more data is available,
179	- `Err(err)`: see `std::io::Read::read()`
180	*/
181	#[inline]
182	pub fn read(&mut self, n: usize) -> Result<Option<&[u8]>, Error> {
183		while n > self.buf.len() {
184			// fill and expand buffer until either:
185			// - buffer starts holding the requested amount of data
186			// - EOF is reached
187			if self.fill()?.is_none() { break };
188		}
189		if self.buf.len() == 0 {
190			// reading past EOF
191			Ok(None)
192		} else {
193			let output = self.buf.consume(n);
194			Ok(Some(output))
195		}
196	}
197
198	/**
199	Returns bytes up until and including `delim`, or until EOF mark. If no content is available, returns `None`.
200
201	Returns:
202
203	- `Ok(Some(data))` with, well, data,
204	- `Ok(None)` if no more data is available,
205	- `Err(err)`: see `std::io::Read::read()`
206	*/
207	#[inline]
208	pub fn read_until(&mut self, delim: u8) -> Result<Option<&[u8]>, Error> {
209		let mut len = None;
210		// position within filled part of the buffer,
211		// from which to continue search for character
212		let mut pos = 0;
213		loop {
214			// fill and expand buffer until either:
215			// - `delim` appears in the buffer
216			// - EOF is reached
217			if let Some(n) = memchr(delim, &self.buf.filled()[pos..]) {
218				len = Some(pos+n);
219				break;
220			}
221			pos = match self.fill()? {
222				None => break, // EOF
223				Some(pos) => pos,
224			};
225		}
226
227		match len {
228			None => { // EOF
229				if self.buf.len() == 0 {
230					Ok(None)
231				} else {
232					let output = self.buf.consume(self.buf.len());
233					Ok(Some(output))
234				}
235			},
236			Some(len) => {
237				let len = len + 1; // also include matching delimiter
238				let output = self.buf.consume(len);
239				Ok(Some(output))
240			},
241		}
242	}
243}
244
// Word list embedded at compile time; serves as a large input for the tests.
#[cfg(test)]
static WORDS: &[u8] = include_bytes!("/usr/share/dict/words");
247
#[cfg(test)]
mod tests {
	use super::*;
	use std::fmt::Debug;

	// Consecutive delimiters must each come back as a slice holding just the delimiter.
	fn read_until_empty_lines<B: Buffer>()
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		// two spaces, three spaces, two spaces
		let mut r = BufRefReaderBuilder::new(&b"  lorem   ipsum  "[..])
			.capacity(4)
			.build::<B>()
			.unwrap();
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b"lorem "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b"ipsum "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), None);
	}

	#[test] fn read_until_empty_lines_vec()  { read_until_empty_lines::<VecBuffer>() }
	#[test] fn read_until_empty_lines_mmap() { read_until_empty_lines::<MmapBuffer>() }

	// Reads the word list line by line and compares against `slice::split()`.
	fn read_until_words<B: Buffer>()
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		let mut r = BufRefReaderBuilder::new(WORDS)
			.capacity(4)
			.build::<B>()
			.unwrap();
		let mut words = WORDS.split(|&c| c == b'\n');
		// unwrap the Result so that a read error fails the test right here
		// instead of silently ending the loop
		while let Some(slice_buf) = r.read_until(b'\n').unwrap() {
			let mut slice_words = words.next().unwrap()
				.to_vec();
			slice_words.push(b'\n');
			assert_eq!(slice_buf, &slice_words[..]);
		}

		// reader: returned immediately after hitting EOF past last b'\n'
		// words: this is .split(), hence empty string past last b'\n'
		assert_eq!(words.next(), Some(&b""[..]));

		assert_eq!(words.next(), None);
	}

	#[test] fn read_until_words_vec()  { read_until_words::<VecBuffer>() }
	#[test] fn read_until_words_mmap() { read_until_words::<MmapBuffer>() }

	// like read_until_words, but splits by rarest character, which is b'Q'
	// also uses slightly bigger initial buffers
	fn read_until_words_long<B: Buffer>()
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		let mut r = BufRefReaderBuilder::new(WORDS)
			.capacity(32)
			.build::<B>()
			.unwrap();
		let mut words = WORDS.split(|&c| c == b'Q').peekable();
		// unwrap the Result so that a read error fails the test right here
		while let Some(slice_buf) = r.read_until(b'Q').unwrap() {
			let mut slice_words = words.next().unwrap()
				.to_vec();
			// every chunk but the last one is terminated by the delimiter
			if words.peek().is_some() {
				slice_words.push(b'Q');
			}
			assert_eq!(slice_buf, &slice_words[..]);
		}

		assert_eq!(words.next(), None);
	}

	#[test] fn read_until_words_long_vec()  { read_until_words_long::<VecBuffer>() }
	#[test] fn read_until_words_long_mmap() { read_until_words_long::<MmapBuffer>() }

	// Fixed-size reads: exact amount, amount larger than what is left, and reading past EOF.
	fn read<B: Buffer>()
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		let mut r = BufRefReaderBuilder::new(&b"lorem ipsum dolor sit amet"[..])
			.capacity(4)
			.build::<B>()
			.unwrap();
		assert_eq!(r.read(5).unwrap(), Some(&b"lorem"[..]));
		assert_eq!(r.read(6).unwrap(), Some(&b" ipsum"[..]));
		assert_eq!(r.read(1024).unwrap(), Some(&b" dolor sit amet"[..]));
		assert_eq!(r.read(1).unwrap(), None);
	}

	#[test] fn read_vec()  { read::<VecBuffer>() }
	#[test] fn read_mmap() { read::<MmapBuffer>() }

	// Reads the word list in `read`-sized pieces using initial capacity `cap`
	// and compares against `slice::chunks()`.
	fn read_words<B: Buffer>(cap: usize, read: usize)
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		let mut r = BufRefReaderBuilder::new(WORDS)
			.capacity(cap)
			.build::<B>()
			.unwrap();
		let mut words = WORDS.chunks(read);
		// unwrap the Result so that a read error fails the test right here
		while let Some(slice_buf) = r.read(read).unwrap() {
			let slice_words = words.next().unwrap();
			assert_eq!(slice_buf, slice_words);
		}
		assert_eq!(words.next(), None);
	}

	#[test] fn read_words_vec_4x3() { read_words::<VecBuffer>(4, 3) }
	#[test] fn read_words_vec_4x5() { read_words::<VecBuffer>(4, 5) }
	#[test] fn read_words_mmap_4x3() { read_words::<MmapBuffer>(4, 3) }
	#[test] fn read_words_mmap_4x5() { read_words::<MmapBuffer>(4, 5) }
}