easy_reader/
lib.rs

1// Copyright 2018 Michele Federici (@ps1dr3x) <michele@federici.tech>
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! # EasyReader
10//!
11//! The main goal of this library is to allow long navigations through the lines of large files, freely moving forwards and backwards or getting random lines without having to consume an iterator.
12//!
//! Currently, with Rust's standard library, it is possible to read a file line by line only through Lines (https://doc.rust-lang.org/std/io/trait.BufRead.html#method.lines), with which it is impossible (or very expensive) to read backwards or to get random lines. Also, since it is an iterator, every line that has already been read is consumed, and to get back to the same line you need to reinstantiate the reader and consume all the lines up to the desired one (e.g. in the case of the last line, all of them).
14//!
15//! **Notes:**
16//!
//! EasyReader by default does not generate an index; it just searches for line terminators on demand. This allows it to be used with very large files without "startup" times and excessive RAM consumption.
//! However, the lack of an index makes reading slower and does not allow random lines to be taken with a perfect distribution. For these reasons there is a method to generate it: the start time will be slower, but all subsequent reads will use the index and will therefore be faster (excluding the index build time, reading times are a bit longer than, but still comparable to, those of a sequential forward read through Lines), and in the random reading case the lines will be taken with a perfect distribution.
//! Note that it is not advisable to generate the index for very large files, as excessive RAM consumption could occur.
20//!
21//! ### Example: basic usage
22//!
23//! ```rust
24//! use easy_reader::EasyReader;
25//! use std::{
26//!     fs::File,
27//!     io::{
28//!         self,
29//!         Error
30//!     }
31//! };
32//!
33//! fn navigate() -> Result<(), Error> {
34//!     let file = File::open("resources/test-file-lf")?;
35//!     let mut reader = EasyReader::new(file)?;
36//!
37//!     // Generate index (optional)
38//!     reader.build_index();
39//!
40//!     // Move through the lines
41//!     println!("First line: {}", reader.next_line()?.unwrap());
42//!     println!("Second line: {}", reader.next_line()?.unwrap());
43//!     println!("First line: {}", reader.prev_line()?.unwrap());
44//!     #[cfg(feature = "rand")]
45//!     println!("Random line: {}", reader.random_line()?.unwrap());
46//!
47//!     // Iteration through the entire file (reverse)
48//!     reader.eof();
49//!     while let Some(line) = reader.prev_line()? {
50//!         println!("{}", line);
51//!     }
52//!
53//!     // You can always start/restart reading from the end of file (EOF)
54//!     reader.eof();
55//!     println!("Last line: {}", reader.prev_line()?.unwrap());
56//!     // Or the begin of file (BOF)
57//!     reader.bof();
58//!     println!("First line: {}", reader.next_line()?.unwrap());
59//!
60//!     Ok(())
61//! }
62//! ```
63//!
64//! ### Example: read random lines endlessly
65//!
66//! ```no_run
67//! use easy_reader::EasyReader;
68//! use std::{
69//!     fs::File,
70//!     io::{
71//!         self,
72//!         Error
73//!     }
74//! };
75//!
76//! fn navigate_forever() -> Result<(), Error> {
77//!     let file = File::open("resources/test-file-lf")?;
78//!     let mut reader = EasyReader::new(file)?;
79//!
80//!     // Generate index (optional)
81//!     reader.build_index();
82//!
83//!     #[cfg(feature = "rand")]
84//!     loop {
85//!         println!("{}", reader.random_line()?.unwrap());
86//!     }
87//!     Ok(())
88//! }
89//! ```
90
91use fnv::FnvHashMap;
92#[cfg(feature = "rand")]
93use rand::Rng;
94use std::io::{self, prelude::*, Error, ErrorKind, SeekFrom};
95
96const CR_BYTE: u8 = b'\r';
97const LF_BYTE: u8 = b'\n';
98
/// Direction in which a read moves relative to the current line.
///
/// The enum is fieldless, so it is cheap to copy and compare.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ReadMode {
    /// Read the line that precedes the current one.
    Prev,
    /// Read the current line again without moving.
    Current,
    /// Read the line that follows the current one.
    Next,
    /// Jump to a randomly chosen line.
    #[cfg(feature = "rand")]
    Random,
}
107
108pub struct EasyReader<R> {
109    file: R,
110    file_size: u64,
111    chunk_size: usize,
112    current_start_line_offset: u64,
113    current_end_line_offset: u64,
114    indexed: bool,
115    offsets_index: Vec<(usize, usize)>,
116    newline_map: FnvHashMap<usize, usize>,
117}
118
119impl<R: Read + Seek> EasyReader<R> {
120    pub fn new(mut file: R) -> Result<Self, Error> {
121        let file_size = file.seek(SeekFrom::End(0))?;
122        if file_size == 0 {
123            return Err(Error::new(ErrorKind::UnexpectedEof, "Empty file"));
124        }
125
126        Ok(EasyReader {
127            file,
128            file_size,
129            chunk_size: 200,
130            current_start_line_offset: 0,
131            current_end_line_offset: 0,
132            indexed: false,
133            offsets_index: Vec::new(),
134            newline_map: FnvHashMap::default(),
135        })
136    }
137
138    pub fn chunk_size(&mut self, size: usize) -> &mut Self {
139        self.chunk_size = size;
140        self
141    }
142
143    pub fn bof(&mut self) -> &mut Self {
144        self.current_start_line_offset = 0;
145        self.current_end_line_offset = 0;
146        self
147    }
148
149    pub fn eof(&mut self) -> &mut Self {
150        self.current_start_line_offset = self.file_size;
151        self.current_end_line_offset = self.file_size;
152        self
153    }
154
155    pub fn build_index(&mut self) -> io::Result<&mut Self> {
156        if self.file_size > usize::max_value() as u64 {
157            // 32bit ¯\_(ツ)_/¯
158            return Err(Error::new(
159                ErrorKind::InvalidData,
160                "File too large to build an index",
161            ));
162        }
163
164        while let Ok(Some(_line)) = self.next_line() {
165            self.offsets_index.push((
166                self.current_start_line_offset as usize,
167                self.current_end_line_offset as usize,
168            ));
169            self.newline_map.insert(
170                self.current_start_line_offset as usize,
171                self.offsets_index.len() - 1,
172            );
173        }
174        self.indexed = true;
175        Ok(self)
176    }
177
178    pub fn prev_line(&mut self) -> io::Result<Option<String>> {
179        self.read_line(ReadMode::Prev)
180    }
181
182    pub fn current_line(&mut self) -> io::Result<Option<String>> {
183        self.read_line(ReadMode::Current)
184    }
185
186    pub fn next_line(&mut self) -> io::Result<Option<String>> {
187        self.read_line(ReadMode::Next)
188    }
189
190    #[cfg(feature = "rand")]
191    pub fn random_line(&mut self) -> io::Result<Option<String>> {
192        self.read_line(ReadMode::Random)
193    }
194
195    fn read_line(&mut self, mode: ReadMode) -> io::Result<Option<String>> {
196        match mode {
197            ReadMode::Prev => {
198                if self.current_start_line_offset == 0 {
199                    return Ok(None);
200                }
201
202                if self.indexed && self.current_start_line_offset < self.file_size {
203                    let current_line = *self
204                        .newline_map
205                        .get(&(self.current_start_line_offset as usize))
206                        .unwrap();
207                    self.current_start_line_offset = self.offsets_index[current_line - 1].0 as u64;
208                    self.current_end_line_offset = self.offsets_index[current_line - 1].1 as u64;
209                    return self.read_line(ReadMode::Current);
210                } else {
211                    self.current_end_line_offset = self.current_start_line_offset;
212                }
213            }
214            ReadMode::Current => {
215                if self.current_start_line_offset == self.current_end_line_offset {
216                    if self.current_start_line_offset == self.file_size {
217                        self.current_start_line_offset =
218                            self.find_start_line(ReadMode::Prev)? as u64;
219                    }
220                    if self.current_end_line_offset == 0 {
221                        self.current_end_line_offset = self.find_end_line()? as u64;
222                    }
223                }
224            }
225            ReadMode::Next => {
226                if self.current_end_line_offset == self.file_size {
227                    return Ok(None);
228                }
229
230                if self.indexed && self.current_start_line_offset > 0 {
231                    let current_line = *self
232                        .newline_map
233                        .get(&(self.current_start_line_offset as usize))
234                        .unwrap();
235                    self.current_start_line_offset = self.offsets_index[current_line + 1].0 as u64;
236                    self.current_end_line_offset = self.offsets_index[current_line + 1].1 as u64;
237                    return self.read_line(ReadMode::Current);
238                } else {
239                    self.current_start_line_offset = self.current_end_line_offset;
240                }
241            }
242            #[cfg(feature = "rand")]
243            ReadMode::Random => {
244                if self.indexed {
245                    let rnd_idx = rand::thread_rng().gen_range(0..self.offsets_index.len() - 1);
246                    self.current_start_line_offset = self.offsets_index[rnd_idx].0 as u64;
247                    self.current_end_line_offset = self.offsets_index[rnd_idx].1 as u64;
248                    return self.read_line(ReadMode::Current);
249                } else {
250                    self.current_start_line_offset =
251                        rand::thread_rng().gen_range(0..self.file_size);
252                }
253            }
254        }
255
256        if mode != ReadMode::Current {
257            self.current_start_line_offset = self.find_start_line(mode)?;
258            self.current_end_line_offset = self.find_end_line()?;
259        }
260
261        let offset = self.current_start_line_offset;
262        let line_length = self.current_end_line_offset - self.current_start_line_offset;
263        let buffer = self.read_bytes(offset, line_length as usize)?;
264
265        let line = String::from_utf8(buffer)
266            .map_err(|err| {
267                Error::new(
268                    ErrorKind::Other,
269                    format!(
270                        "The line starting at byte: {} and ending at byte: {} is not valid UTF-8. Conversion error: {}",
271                        self.current_start_line_offset,
272                        self.current_end_line_offset,
273                        err
274                    )
275                )
276            })?;
277
278        Ok(Some(line))
279    }
280
281    fn find_start_line(&mut self, mode: ReadMode) -> io::Result<u64> {
282        let mut new_start_line_offset = self.current_start_line_offset;
283
284        let mut n_chunks = 0;
285        loop {
286            if new_start_line_offset == 0 {
287                break;
288            }
289
290            let mut found = false;
291            match mode {
292                ReadMode::Current => (),
293                ReadMode::Next => {
294                    let chunk = self.read_chunk(new_start_line_offset)?;
295
296                    for chunk_el in chunk.iter().take(self.chunk_size) {
297                        if *chunk_el == LF_BYTE {
298                            found = true;
299                        }
300
301                        new_start_line_offset += 1;
302                        if found {
303                            break;
304                        }
305                    }
306                }
307                _ => {
308                    let mut margin = 0;
309                    let from = {
310                        if new_start_line_offset < (self.chunk_size as u64) {
311                            margin = self.chunk_size - (new_start_line_offset as usize);
312                            0
313                        } else {
314                            new_start_line_offset - (self.chunk_size as u64)
315                        }
316                    };
317
318                    let mut chunk = self.read_chunk(from)?;
319                    chunk.reverse();
320
321                    for (i, chunk_el) in chunk.iter().enumerate().take(self.chunk_size) {
322                        if i < margin {
323                            continue;
324                        }
325                        if new_start_line_offset == 0 {
326                            found = true;
327                            break;
328                        } else {
329                            if n_chunks == 0
330                                && self.current_start_line_offset == new_start_line_offset
331                            {
332                                #[cfg(feature = "rand")]
333                                {
334                                    if mode != ReadMode::Random {
335                                        // Not moved yet
336                                        new_start_line_offset -= 1;
337                                        continue;
338                                    }
339                                }
340                                #[cfg(not(feature = "rand"))]
341                                {
342                                    // Not moved yet
343                                    new_start_line_offset -= 1;
344                                    continue;
345                                }
346                            }
347
348                            if *chunk_el == LF_BYTE {
349                                found = true;
350                            }
351                        }
352
353                        if found {
354                            break;
355                        }
356                        new_start_line_offset -= 1;
357                    }
358                }
359            }
360
361            if found {
362                break;
363            }
364            n_chunks += 1;
365        }
366
367        Ok(new_start_line_offset)
368    }
369
370    fn find_end_line(&mut self) -> io::Result<u64> {
371        let mut new_end_line_offset = self.current_start_line_offset;
372
373        loop {
374            if new_end_line_offset == self.file_size {
375                break;
376            }
377
378            let chunk = self.read_chunk(new_end_line_offset)?;
379
380            let mut found = false;
381            for i in 0..self.chunk_size {
382                if new_end_line_offset == self.file_size {
383                    found = true;
384                    break;
385                } else if chunk[i] == LF_BYTE {
386                    // Handle CRLF files
387                    if i > 0 {
388                        if chunk[i - 1] == CR_BYTE {
389                            new_end_line_offset -= 1;
390                        }
391                    } else if new_end_line_offset < self.file_size && new_end_line_offset > 0 {
392                        let next_byte = self.read_bytes(new_end_line_offset - 1, 1)?[0];
393                        if next_byte == CR_BYTE {
394                            new_end_line_offset -= 1;
395                        }
396                    }
397                    found = true;
398                    break;
399                } else {
400                    new_end_line_offset += 1;
401                }
402            }
403            if found {
404                break;
405            }
406        }
407
408        Ok(new_end_line_offset)
409    }
410
411    fn read_chunk(&mut self, offset: u64) -> io::Result<Vec<u8>> {
412        let chunk_size = self.chunk_size;
413        self.read_bytes(offset, chunk_size)
414    }
415
416    fn read_bytes(&mut self, offset: u64, bytes: usize) -> io::Result<Vec<u8>> {
417        let mut buffer = vec![0; bytes];
418        self.file.seek(SeekFrom::Start(offset as u64))?;
419        let _ = self.file.read(&mut buffer)?;
420        Ok(buffer)
421    }
422}
423
424#[cfg(test)]
425mod tests;