1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
//! `Bytelines` is a simple library crate which offers line iteration for
//! `BufRead` via `&[u8]` rather than `String`.
//!
//! Due to the removal of checking for `String` validity, this is typically
//! much faster for reading in raw data and much more flexible. The APIs
//! offered in this crate are intended to function exactly the same as the
//! `lines` function inside the `BufRead` trait, except that the bytes which
//! precede the line delimiter are not validated.
//!
//! Performance of [ByteLines](struct.ByteLines.html) is very close to that of
//! writing a `loop` manually, whereas [RefByteLines](struct.RefByteLines.html)
//! is practically identical due to the avoidance of "unnecessary" allocations.
use std::io::BufRead;
use std::marker::PhantomData;

/// Represents anything which can provide iterators of byte lines.
pub trait ByteLinesReader<'a, B>
where
    B: BufRead,
{
    /// Returns an iterator over the lines of this reader (as `Vec<u8>`).
    ///
    /// Just like the equivalent in the standard library, the iterator returned
    /// from this function yields one `io::Result` per line; the difference is
    /// that the success value is a `Vec<u8>` rather than a `String`. Each
    /// vector returned will not have a newline byte (the 0xA byte) or CRLF
    /// (0xD, 0xA bytes) at the end.
    fn byte_lines(self) -> ByteLines<'a, B>;

    /// Returns an iterator over the lines of this reader (as `&[u8]`).
    ///
    /// This method operates in the same way as [byte_lines](#method.byte_lines),
    /// except that the iterated values are references to the internal byte buffer.
    /// Due to this, you can only safely hold a single line at any given time, and
    /// as such this method is marked as `unsafe`. If you're using usual loop syntax
    /// of `for $x in $y` your code will not come across this unsafe contract.
    ///
    /// When performance is important, this method should be used rather than
    /// [byte_lines](#method.byte_lines) as there is only a single buffer
    /// allocation (disregarding any potential resizing that may be required),
    /// whereas [byte_lines](#method.byte_lines) will allocate a `Vec<u8>` for
    /// each input line and provide ownership.
    ///
    /// # Safety
    ///
    /// Every slice yielded by the returned iterator points into a buffer that
    /// is cleared and refilled by the following call to `next()`. The caller
    /// must not read a line after the iterator has been advanced past it.
    unsafe fn ref_byte_lines(self) -> RefByteLines<'a, B>;
}

/// Blanket implementation: every `BufRead` gains both line iterators.
impl<'a, B> ByteLinesReader<'a, B> for B
where
    B: BufRead,
{
    /// Consumes the reader and returns the owning line iterator (`Vec<u8>` items).
    fn byte_lines(self) -> ByteLines<'a, Self> {
        // SAFETY: `ByteLines::next` copies each borrowed line into a `Vec<u8>`
        // before the borrowing iterator is advanced again, so no slice is
        // ever observed after the buffer behind it has been reused.
        let inner = unsafe { self.ref_byte_lines() };
        ByteLines { inner }
    }

    /// Consumes the reader and returns the borrowing line iterator (`&[u8]` items).
    ///
    /// The iterator starts out with an empty line buffer; nothing is read
    /// from the reader until `next()` is called.
    unsafe fn ref_byte_lines(self) -> RefByteLines<'a, Self> {
        let buffer = Vec::new();
        RefByteLines {
            reader: self,
            buffer,
            marker: PhantomData,
        }
    }
}

/// Provides a safe iterator over lines of input as byte vectors (`Vec<u8>`).
///
/// Internally, this iterator delegates to `RefByteLines` - the only difference
/// being that this iterator copies each yielded slice into a newly allocated
/// vector, thus making ownership clear and letting callers keep as many lines
/// around as they like.
pub struct ByteLines<'a, B>
where
    B: BufRead,
{
    /// Borrowing iterator that performs the actual reading.
    inner: RefByteLines<'a, B>,
}

/// Provides an iterator over lines of input as byte slices (`&[u8]`).
///
/// This iterator requires opting in to the use of unsafe code, because every
/// slice it yields points into a single internal buffer which is cleared and
/// refilled each time `next()` is called. A line kept across a second call to
/// `next()` therefore no longer refers to the bytes it was created from. This
/// iterator should only be used in a traditional `for $x in $y` syntax,
/// otherwise values cannot be relied upon as being consistent.
///
/// Here is a demonstration of this issue in action using a very basic clash of
/// the same length. Note that you might (in some cases) get mixed input if you
/// went from a longer length value to a shorter length.
///
/// ```rust
/// use bytelines::*;
/// use std::fs::File;
/// use std::io::BufReader;
///
/// unsafe {
///     // construct our iterator from our file input
///     let file = File::open("./res/numbers.txt").unwrap();
///     let mut iter = BufReader::new(file).ref_byte_lines();
///
///     // take the first line from the input
///     let line1 = iter.next();
///     println!("{:?}", line1); // equivalent to bytes of "0"
///
///     // take the second line from the input
///     let line2 = iter.next();
///     println!("{:?}", line2); // equivalent to bytes of "1"
///     println!("{:?}", line1); // also now equivalent to bytes of "1"
/// }
/// ```
///
/// This implementation is much more memory efficient than `ByteLines` (and more
/// performant), and so should be used in performance critical code blocks. As
/// a small aside, `ByteLines` simply delegates to this struct internally and
/// provides an allocation on top to enforce all ownership correctly.
pub struct RefByteLines<'a, B>
where
    B: BufRead,
{
    /// Line buffer shared by every iteration; cleared and refilled by `next()`.
    buffer: Vec<u8>,
    /// Zero-sized marker that gives the lifetime `'a` a use inside the struct.
    marker: PhantomData<&'a B>,
    /// Underlying reader the lines are pulled from.
    reader: B,
}

/// Owning adaptor: turns each borrowed line of the inner iterator into a `Vec<u8>`.
impl<'a, B> Iterator for ByteLines<'a, B>
where
    B: BufRead,
{
    type Item = Result<Vec<u8>, std::io::Error>;

    /// Advances the inner iterator and hands back an owned copy of its line.
    ///
    /// End of input (`None`) and read errors are passed through untouched.
    fn next(&mut self) -> Option<Result<Vec<u8>, std::io::Error>> {
        // `?` forwards the end-of-input `None` straight to the caller
        let borrowed = self.inner.next()?;

        // copy right away, before the shared buffer can be reused
        Some(borrowed.map(<[u8]>::to_vec))
    }
}

/// Base iterator for line retrieval.
impl<'a, B> Iterator for RefByteLines<'a, B>
where
    B: BufRead,
{
    type Item = Result<&'a [u8], std::io::Error>;

    /// Retrieves the next line in the iterator (if any).
    ///
    /// Yields `None` once the reader reports no more input, `Some(Err(_))` if
    /// the read fails, and otherwise the bytes of the line with a trailing
    /// `\n` or `\r\n` removed. Blank lines are yielded as empty slices, and a
    /// final line without a terminator is yielded unchanged.
    ///
    /// The returned slice borrows the iterator's internal buffer and is only
    /// meaningful until `next` is called again.
    fn next(&mut self) -> Option<Result<&'a [u8], std::io::Error>> {
        // the buffer is shared by every line, so discard the previous one
        self.buffer.clear();

        // read up to (and including) the next newline, as raw bytes
        match self.reader.read_until(b'\n', &mut self.buffer) {
            // short circuit on error
            Err(e) => Some(Err(e)),

            // no input, done
            Ok(0) => None,

            // bytes!
            Ok(mut n) => {
                // always "pop" the delim; only the last line of input can lack one
                if self.buffer[..n].ends_with(b"\n") {
                    n -= 1;
                    // also "pop" a trailing \r; `ends_with` is simply false when
                    // nothing precedes the newline, so a blank line ("\n") can
                    // no longer underflow `n`
                    if self.buffer[..n].ends_with(b"\r") {
                        n -= 1;
                    }
                }

                // SAFETY: the slice points into `self.buffer`, which lives as long as the
                // iterator but is cleared (and possibly reallocated) by the next call to
                // `next`. The lifetime is stretched to `'a` purely to avoid a per-line
                // allocation. This iterator is only handed out by the `unsafe fn`
                // `ref_byte_lines`, whose caller takes on the obligation of never using a
                // line after advancing; `for $x in $y` loops satisfy that automatically.
                unsafe {
                    Some(Ok(std::mem::transmute::<&[u8], &'a [u8]>(
                        &self.buffer[..n],
                    )))
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::BufReader;

    /// Reads the numbers fixture through `byte_lines` and checks that its
    /// first nine lines are the decimal numbers 0 through 8.
    #[test]
    fn test_basic_iterator() {
        let reader = BufReader::new(File::open("./res/numbers.txt").unwrap());

        // decode every line up front; any I/O or UTF-8 failure aborts the test
        let mut lines: Vec<String> = Vec::new();
        for line in reader.byte_lines() {
            lines.push(String::from_utf8(line.unwrap()).unwrap());
        }

        // index directly so that a fixture shorter than nine lines still fails
        for (i, expected) in (0..9).map(|n| n.to_string()).enumerate() {
            assert_eq!(lines[i], expected);
        }
    }
}