1use fnv::FnvHashMap;
92#[cfg(feature = "rand")]
93use rand::Rng;
94use std::io::{self, prelude::*, Error, ErrorKind, SeekFrom};
95
/// Carriage return (`\r`): the byte that precedes the line feed in a CRLF terminator.
const CR_BYTE: u8 = b'\r';
/// Line feed (`\n`): the byte the boundary scans look for to find where a line ends.
const LF_BYTE: u8 = b'\n';
98
/// Direction in which the reader moves before returning a line.
///
/// The enum is a plain tag with no payload, so it is `Copy` and can be passed
/// by value and compared freely.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ReadMode {
    /// Step to the line before the current one.
    Prev,
    /// Stay on the current line.
    Current,
    /// Step to the line after the current one.
    Next,
    /// Jump to a randomly selected line (requires the `rand` feature).
    #[cfg(feature = "rand")]
    Random,
}
107
108pub struct EasyReader<R> {
109 file: R,
110 file_size: u64,
111 chunk_size: usize,
112 current_start_line_offset: u64,
113 current_end_line_offset: u64,
114 indexed: bool,
115 offsets_index: Vec<(usize, usize)>,
116 newline_map: FnvHashMap<usize, usize>,
117}
118
119impl<R: Read + Seek> EasyReader<R> {
120 pub fn new(mut file: R) -> Result<Self, Error> {
121 let file_size = file.seek(SeekFrom::End(0))?;
122 if file_size == 0 {
123 return Err(Error::new(ErrorKind::UnexpectedEof, "Empty file"));
124 }
125
126 Ok(EasyReader {
127 file,
128 file_size,
129 chunk_size: 200,
130 current_start_line_offset: 0,
131 current_end_line_offset: 0,
132 indexed: false,
133 offsets_index: Vec::new(),
134 newline_map: FnvHashMap::default(),
135 })
136 }
137
138 pub fn chunk_size(&mut self, size: usize) -> &mut Self {
139 self.chunk_size = size;
140 self
141 }
142
143 pub fn bof(&mut self) -> &mut Self {
144 self.current_start_line_offset = 0;
145 self.current_end_line_offset = 0;
146 self
147 }
148
149 pub fn eof(&mut self) -> &mut Self {
150 self.current_start_line_offset = self.file_size;
151 self.current_end_line_offset = self.file_size;
152 self
153 }
154
155 pub fn build_index(&mut self) -> io::Result<&mut Self> {
156 if self.file_size > usize::max_value() as u64 {
157 return Err(Error::new(
159 ErrorKind::InvalidData,
160 "File too large to build an index",
161 ));
162 }
163
164 while let Ok(Some(_line)) = self.next_line() {
165 self.offsets_index.push((
166 self.current_start_line_offset as usize,
167 self.current_end_line_offset as usize,
168 ));
169 self.newline_map.insert(
170 self.current_start_line_offset as usize,
171 self.offsets_index.len() - 1,
172 );
173 }
174 self.indexed = true;
175 Ok(self)
176 }
177
178 pub fn prev_line(&mut self) -> io::Result<Option<String>> {
179 self.read_line(ReadMode::Prev)
180 }
181
182 pub fn current_line(&mut self) -> io::Result<Option<String>> {
183 self.read_line(ReadMode::Current)
184 }
185
186 pub fn next_line(&mut self) -> io::Result<Option<String>> {
187 self.read_line(ReadMode::Next)
188 }
189
190 #[cfg(feature = "rand")]
191 pub fn random_line(&mut self) -> io::Result<Option<String>> {
192 self.read_line(ReadMode::Random)
193 }
194
195 fn read_line(&mut self, mode: ReadMode) -> io::Result<Option<String>> {
196 match mode {
197 ReadMode::Prev => {
198 if self.current_start_line_offset == 0 {
199 return Ok(None);
200 }
201
202 if self.indexed && self.current_start_line_offset < self.file_size {
203 let current_line = *self
204 .newline_map
205 .get(&(self.current_start_line_offset as usize))
206 .unwrap();
207 self.current_start_line_offset = self.offsets_index[current_line - 1].0 as u64;
208 self.current_end_line_offset = self.offsets_index[current_line - 1].1 as u64;
209 return self.read_line(ReadMode::Current);
210 } else {
211 self.current_end_line_offset = self.current_start_line_offset;
212 }
213 }
214 ReadMode::Current => {
215 if self.current_start_line_offset == self.current_end_line_offset {
216 if self.current_start_line_offset == self.file_size {
217 self.current_start_line_offset =
218 self.find_start_line(ReadMode::Prev)? as u64;
219 }
220 if self.current_end_line_offset == 0 {
221 self.current_end_line_offset = self.find_end_line()? as u64;
222 }
223 }
224 }
225 ReadMode::Next => {
226 if self.current_end_line_offset == self.file_size {
227 return Ok(None);
228 }
229
230 if self.indexed && self.current_start_line_offset > 0 {
231 let current_line = *self
232 .newline_map
233 .get(&(self.current_start_line_offset as usize))
234 .unwrap();
235 self.current_start_line_offset = self.offsets_index[current_line + 1].0 as u64;
236 self.current_end_line_offset = self.offsets_index[current_line + 1].1 as u64;
237 return self.read_line(ReadMode::Current);
238 } else {
239 self.current_start_line_offset = self.current_end_line_offset;
240 }
241 }
242 #[cfg(feature = "rand")]
243 ReadMode::Random => {
244 if self.indexed {
245 let rnd_idx = rand::thread_rng().gen_range(0..self.offsets_index.len() - 1);
246 self.current_start_line_offset = self.offsets_index[rnd_idx].0 as u64;
247 self.current_end_line_offset = self.offsets_index[rnd_idx].1 as u64;
248 return self.read_line(ReadMode::Current);
249 } else {
250 self.current_start_line_offset =
251 rand::thread_rng().gen_range(0..self.file_size);
252 }
253 }
254 }
255
256 if mode != ReadMode::Current {
257 self.current_start_line_offset = self.find_start_line(mode)?;
258 self.current_end_line_offset = self.find_end_line()?;
259 }
260
261 let offset = self.current_start_line_offset;
262 let line_length = self.current_end_line_offset - self.current_start_line_offset;
263 let buffer = self.read_bytes(offset, line_length as usize)?;
264
265 let line = String::from_utf8(buffer)
266 .map_err(|err| {
267 Error::new(
268 ErrorKind::Other,
269 format!(
270 "The line starting at byte: {} and ending at byte: {} is not valid UTF-8. Conversion error: {}",
271 self.current_start_line_offset,
272 self.current_end_line_offset,
273 err
274 )
275 )
276 })?;
277
278 Ok(Some(line))
279 }
280
281 fn find_start_line(&mut self, mode: ReadMode) -> io::Result<u64> {
282 let mut new_start_line_offset = self.current_start_line_offset;
283
284 let mut n_chunks = 0;
285 loop {
286 if new_start_line_offset == 0 {
287 break;
288 }
289
290 let mut found = false;
291 match mode {
292 ReadMode::Current => (),
293 ReadMode::Next => {
294 let chunk = self.read_chunk(new_start_line_offset)?;
295
296 for chunk_el in chunk.iter().take(self.chunk_size) {
297 if *chunk_el == LF_BYTE {
298 found = true;
299 }
300
301 new_start_line_offset += 1;
302 if found {
303 break;
304 }
305 }
306 }
307 _ => {
308 let mut margin = 0;
309 let from = {
310 if new_start_line_offset < (self.chunk_size as u64) {
311 margin = self.chunk_size - (new_start_line_offset as usize);
312 0
313 } else {
314 new_start_line_offset - (self.chunk_size as u64)
315 }
316 };
317
318 let mut chunk = self.read_chunk(from)?;
319 chunk.reverse();
320
321 for (i, chunk_el) in chunk.iter().enumerate().take(self.chunk_size) {
322 if i < margin {
323 continue;
324 }
325 if new_start_line_offset == 0 {
326 found = true;
327 break;
328 } else {
329 if n_chunks == 0
330 && self.current_start_line_offset == new_start_line_offset
331 {
332 #[cfg(feature = "rand")]
333 {
334 if mode != ReadMode::Random {
335 new_start_line_offset -= 1;
337 continue;
338 }
339 }
340 #[cfg(not(feature = "rand"))]
341 {
342 new_start_line_offset -= 1;
344 continue;
345 }
346 }
347
348 if *chunk_el == LF_BYTE {
349 found = true;
350 }
351 }
352
353 if found {
354 break;
355 }
356 new_start_line_offset -= 1;
357 }
358 }
359 }
360
361 if found {
362 break;
363 }
364 n_chunks += 1;
365 }
366
367 Ok(new_start_line_offset)
368 }
369
370 fn find_end_line(&mut self) -> io::Result<u64> {
371 let mut new_end_line_offset = self.current_start_line_offset;
372
373 loop {
374 if new_end_line_offset == self.file_size {
375 break;
376 }
377
378 let chunk = self.read_chunk(new_end_line_offset)?;
379
380 let mut found = false;
381 for i in 0..self.chunk_size {
382 if new_end_line_offset == self.file_size {
383 found = true;
384 break;
385 } else if chunk[i] == LF_BYTE {
386 if i > 0 {
388 if chunk[i - 1] == CR_BYTE {
389 new_end_line_offset -= 1;
390 }
391 } else if new_end_line_offset < self.file_size && new_end_line_offset > 0 {
392 let next_byte = self.read_bytes(new_end_line_offset - 1, 1)?[0];
393 if next_byte == CR_BYTE {
394 new_end_line_offset -= 1;
395 }
396 }
397 found = true;
398 break;
399 } else {
400 new_end_line_offset += 1;
401 }
402 }
403 if found {
404 break;
405 }
406 }
407
408 Ok(new_end_line_offset)
409 }
410
411 fn read_chunk(&mut self, offset: u64) -> io::Result<Vec<u8>> {
412 let chunk_size = self.chunk_size;
413 self.read_bytes(offset, chunk_size)
414 }
415
416 fn read_bytes(&mut self, offset: u64, bytes: usize) -> io::Result<Vec<u8>> {
417 let mut buffer = vec![0; bytes];
418 self.file.seek(SeekFrom::Start(offset as u64))?;
419 let _ = self.file.read(&mut buffer)?;
420 Ok(buffer)
421 }
422}
423
// Unit tests are declared as a separate `tests` module, compiled only for test builds.
#[cfg(test)]
mod tests;