1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
/*!
Faster, growable buffering reader for when there's little to no need to modify data, nor to keep it alive past next read.

`std::io::BufReader` works by copying data from its internal buffer into user-provided `Vec`/`String`,
or, in case of `.lines()`, by emitting new heap-allocated `String` for each iteration.
While convenient and versatile, this is not the fastest approach.

Instead, `BufRefReader` references its internal buffer with each read, returning `&[u8]`.
Lack of extra allocations yields better read performance in situations where most (if not all) of read data:

- requires no modifications,
- is never used outside of a loop body and does not need to be duplicated into the heap for future use.

While being more performant, this approach also severely limits applicability of this reader:

- it does not (and cannot) implement `BufRead` and cannot be used as a direct replacement for `BufReader`;
- returned values are only valid between calls to reading functions (i.e. they cannot outlive even a single loop cycle), and Rust's borrow checker will prevent you from using stale references;
- consequently, `BufRefReader` cannot be turned into an `Iterator` (here's an easy way to think about it: what would `Iterator::collect()` return?);
- returned references are immutable;
- obviously, there's also nothing that can return `String`s or `&str`s for you.

## Choice a of buffer

Use [`MmapBuffer`](struct.MmapBuffer.html) unless:

- [slice-deque](https://github.com/gnzlbg/slice_deque) is not available for your platform (e.g. no support for `mmap`),
- you need very small buffers (smaller than 1 memory page),
- you're about to create a lot of buffers in a short period of time ([`new()`](trait.Buffer.html#tymethod.new) is relatively expensive),
- you're expecting buffer to grow a lot (consider, if possible, preallocating larger buffers through [`BufRefReaderBuilder.capacity`](struct.BufRefReaderBuilder.html#method.capacity)),
- you have some very special concerns re: memory maps and malloc bypass (special allocators, possible kernel inefficiency due to large amount of mapped memory regions etc.).

## Examples

Read data word by word:

```
use buf_ref_reader::*;

fn read<B: Buffer>() -> Result<(), Error>
where
	Error: From<B::Error>,
	// add this if you plan to `unwrap()` errors returned by `read()` et al.
	//B::Error: std::fmt::Debug,
{
	// &[u8] implements Read, hence we use it as our data source for this example
	let data = b"lorem ipsum dolor sit amet";
	let mut r = BufRefReaderBuilder::new(&data[..])
		.capacity(4)
		.build::<B>()?;

	assert_eq!(r.read_until(b' ')?, Some(&b"lorem "[..]));
	assert_eq!(r.read_until(b' ')?, Some(&b"ipsum "[..]));
	assert_eq!(r.read_until(b' ')?, Some(&b"dolor "[..]));
	assert_eq!(r.read_until(b' ')?, Some(&b"sit "[..]));
	assert_eq!(r.read_until(b' ')?, Some(&b"amet"[..]));
	assert_eq!(r.read_until(b' ')?, None); // EOF
	assert_eq!(r.read_until(b' ')?, None);

	Ok(())
}

fn main() {
	read::<VecBuffer>().unwrap();
	read::<MmapBuffer>().unwrap();
}
```
*/

#![warn(missing_docs)]

use quick_error::quick_error;

use std::io::{self, Read};
use memchr::memchr;

mod buffer;
pub use buffer::{
	Buffer,
	VecBuffer,
	MmapBuffer,
};
use slice_deque::AllocError;

use std::convert::From;

/**
Buffering reader.

See [module-level docs](index.html) for examples.
*/
pub struct BufRefReader<R, B> {
	src: R,
	buf: B,
}

/**
Builder for [`BufRefReader`](struct.BufRefReader.html).

See [module-level docs](index.html) for examples.
*/
pub struct BufRefReaderBuilder<R> {
	src: R,
	bufsize: usize,
}
impl<R: Read> BufRefReaderBuilder<R> {
	/// Creates new builder with given reader and default options.
	pub fn new(src: R) -> Self {
		BufRefReaderBuilder {
			src,
			bufsize: 8192,
		}
	}

	/// Set initial buffer capacity.
	pub fn capacity(mut self, bufsize: usize) -> Self {
		self.bufsize = bufsize;
		self
	}

	/// Create actual reader.
	pub fn build<B: Buffer>(self) -> Result<BufRefReader<R, B>, B::Error> {
		Ok(BufRefReader {
			src: self.src,
			buf: B::new(self.bufsize)?,
		})
	}
}

quick_error! {
	/// Error type that reading functions might emit
	#[derive(Debug)]
	pub enum Error {
		/// Error reading from actual reader
		IO(err: io::Error) { from() }
		/// Indicates failure to create/grow buffer
		Buf(err: AllocError) { from() }
	}
}
impl From<()> for Error {
	// VecBuffer never emits errors, it only panics
	fn from(_: ()) -> Self {
		unimplemented!()
	}
}

impl<R: Read, B: Buffer> BufRefReader<R, B>
where Error: From<B::Error>
{
	/// Creates buffered reader with default options. Look for [`BufRefReaderBuilder`](struct.BufRefReaderBuilder.html) for tweaks.
	pub fn new(src: R) -> Result<BufRefReader<R, B>, B::Error> {
		BufRefReaderBuilder::new(src)
			.build()
	}

	// returns Some(where appended data starts within the filled part of the buffer),
	// or None for EOF
	#[inline]
	fn fill(&mut self) -> Result<Option<usize>, Error> {
		self.buf.enlarge()?;

		let old_len = self.buf.len();

		match self.src.read(self.buf.appendable())? {
			0 => Ok(None), // EOF
			n => {
				self.buf.grow(n);
				Ok(Some(old_len))
			}
		}
	}

	/**
	Returns requested amount of bytes, or less if EOF prevents reader from fulfilling the request.

	Returns:

	- `Ok(Some(data))` with, well, data,
	- `Ok(None)` if no more data is available,
	- `Err(err)`: see `std::io::Read::read()`
	*/
	#[inline]
	pub fn read(&mut self, n: usize) -> Result<Option<&[u8]>, Error> {
		while n > self.buf.len() {
			// fill and expand buffer until either:
			// - buffer starts holding the requested amount of data
			// - EOF is reached
			if self.fill()?.is_none() { break };
		}
		if self.buf.len() == 0 {
			// reading past EOF
			Ok(None)
		} else {
			let output = self.buf.consume(n);
			Ok(Some(output))
		}
	}

	/**
	Returns bytes up until and including `delim`, or until EOF mark. If no content is available, returns `None`.

	Returns:

	- `Ok(Some(data))` with, well, data,
	- `Ok(None)` if no more data is available,
	- `Err(err)`: see `std::io::Read::read()`
	*/
	#[inline]
	pub fn read_until(&mut self, delim: u8) -> Result<Option<&[u8]>, Error> {
		let mut len = None;
		// position within filled part of the buffer,
		// from which to continue search for character
		let mut pos = 0;
		loop {
			// fill and expand buffer until either:
			// - `delim` appears in the buffer
			// - EOF is reached
			if let Some(n) = memchr(delim, &self.buf.filled()[pos..]) {
				len = Some(pos+n);
				break;
			}
			pos = match self.fill()? {
				None => break, // EOF
				Some(pos) => pos,
			};
		}

		match len {
			None => { // EOF
				if self.buf.len() == 0 {
					Ok(None)
				} else {
					let output = self.buf.consume(self.buf.len());
					Ok(Some(output))
				}
			},
			Some(len) => {
				let len = len + 1; // also include matching delimiter
				let output = self.buf.consume(len);
				Ok(Some(output))
			},
		}
	}
}

#[cfg(test)]
static WORDS: &'static [u8] = include_bytes!("/usr/share/dict/words");

#[cfg(test)]
mod tests {
	use super::*;
	use std::fmt::Debug;

	fn read_until_empty_lines<B: Buffer>()
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		// two spaces, three spaces, two spaces
		let mut r = BufRefReaderBuilder::new(&b"  lorem   ipsum  "[..])
			.capacity(4)
			.build::<B>()
			.unwrap();
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b"lorem "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b"ipsum "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), Some(&b" "[..]));
		assert_eq!(r.read_until(b' ').unwrap(), None);
	}

	#[test] fn read_until_empty_lines_vec()  { read_until_empty_lines::<VecBuffer>() }
	#[test] fn read_until_empty_lines_mmap() { read_until_empty_lines::<MmapBuffer>() }

	fn read_until_words<B: Buffer>()
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		let mut r = BufRefReaderBuilder::new(WORDS)
			.capacity(4)
			.build::<B>()
			.unwrap();
		let mut words = WORDS.split(|&c| c == b'\n');
		while let Ok(Some(slice_buf)) = r.read_until(b'\n') {
			let mut slice_words = words.next().unwrap()
				.to_vec();
			slice_words.push(b'\n');
			assert_eq!(slice_buf, &slice_words[..]);
		}

		// reader: returned immediately after hitting EOF past last b'\n'
		// words: this is .split(), hence empty string past last b'\n'
		assert_eq!(words.next(), Some(&b""[..]));

		assert_eq!(words.next(), None);
	}

	#[test] fn read_until_words_vec()  { read_until_words::<VecBuffer>() }
	#[test] fn read_until_words_mmap() { read_until_words::<MmapBuffer>() }

	// like read_until_words, but splits by rarest character, which is b'Q'
	// also uses slightly bigger initial buffers
	fn read_until_words_long<B: Buffer>()
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		let mut r = BufRefReaderBuilder::new(WORDS)
			.capacity(32)
			.build::<B>()
			.unwrap();
		let mut words = WORDS.split(|&c| c == b'Q').peekable();
		while let Ok(Some(slice_buf)) = r.read_until(b'Q') {
			let mut slice_words = words.next().unwrap()
				.to_vec();
			if words.peek() != None {
				slice_words.push(b'Q');
			}
			assert_eq!(slice_buf, &slice_words[..]);
		}

		assert_eq!(words.next(), None);
	}

	#[test] fn read_until_words_long_vec()  { read_until_words_long::<VecBuffer>() }
	#[test] fn read_until_words_long_mmap() { read_until_words_long::<MmapBuffer>() }

	fn read<B: Buffer>()
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		let mut r = BufRefReaderBuilder::new(&b"lorem ipsum dolor sit amet"[..])
			.capacity(4)
			.build::<B>()
			.unwrap();
		assert_eq!(r.read(5).unwrap(), Some(&b"lorem"[..]));
		assert_eq!(r.read(6).unwrap(), Some(&b" ipsum"[..]));
		assert_eq!(r.read(1024).unwrap(), Some(&b" dolor sit amet"[..]));
		assert_eq!(r.read(1).unwrap(), None);
	}

	#[test] fn read_vec()  { read::<VecBuffer>() }
	#[test] fn read_mmap() { read::<MmapBuffer>() }

	fn read_words<B: Buffer>(cap: usize, read: usize)
	where
		B::Error: Debug,
		Error: From<B::Error>,
	{
		let mut r = BufRefReaderBuilder::new(WORDS)
			.capacity(cap)
			.build::<B>()
			.unwrap();
		let mut words = WORDS.chunks(read);
		while let Ok(Some(slice_buf)) = r.read(read) {
			let slice_words = words.next().unwrap();
			assert_eq!(slice_buf, slice_words);
		}
		assert_eq!(words.next(), None);
	}

	#[test] fn read_words_vec_4x3() { read_words::<VecBuffer>(4, 3) }
	#[test] fn read_words_vec_4x5() { read_words::<VecBuffer>(4, 5) }
	#[test] fn read_words_mmap_4x3() { read_words::<MmapBuffer>(4, 3) }
	#[test] fn read_words_mmap_4x5() { read_words::<MmapBuffer>(4, 5) }
}