1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
//! Utilities for creating collections
use std::io::prelude::*;
use std::io::{SeekFrom,BufReader,Error};
use std::fs::{File,metadata};

use tange::deferred::{Deferred, batch_apply};

use collection::memory::MemoryCollection;

/// Describes one slice of a file that is read as a single partition.
#[derive(Clone)]
struct Chunk {
    /// Path of the file the slice belongs to.
    path: String,
    /// Byte offset from the start of the file at which reading begins.
    start: u64,
    /// Byte offset that bounds the slice; reading stops once the position moves past it.
    end: u64,
}

/// Reads a newline-delimited text file, creating a new partition for every
/// `chunk_size` bytes of the file.
///
/// The file is split into consecutive byte ranges of `chunk_size` bytes. Each
/// range becomes a `Chunk` that is lifted into a `Deferred` and handed to
/// `read`, which turns it into the lines belonging to that range.
///
/// An empty file produces a collection with no partitions.
///
/// # Errors
///
/// Returns an error if the file's metadata cannot be read, or an
/// `InvalidInput` error if `chunk_size` is zero while the file is non-empty
/// (a zero-sized chunk could never advance through the file).
pub fn read_text(path: &str, chunk_size: u64) -> Result<MemoryCollection<String>,Error> {
    // The file size determines how many chunks are needed.
    let file_size = metadata(path)?.len();

    // With a zero chunk size the offset below would never advance, looping
    // forever while accumulating chunks; reject that combination up front.
    if file_size > 0 && chunk_size == 0 {
        return Err(Error::new(
            ::std::io::ErrorKind::InvalidInput,
            "chunk_size must be greater than zero",
        ));
    }

    let mut dfs = Vec::new();
    let mut cur_offset = 0u64;
    while cur_offset < file_size {
        // Saturate rather than overflow when `chunk_size` is close to `u64::MAX`;
        // a saturated end is still >= `file_size`, so the loop terminates.
        let end = cur_offset.saturating_add(chunk_size);
        let chunk = Chunk {
            path: path.into(),
            start: cur_offset,
            end: end
        };
        dfs.push(Deferred::lift(chunk,
                                Some(&format!("File: {}, start: {}", path, cur_offset))));
        cur_offset = end;
    }

    Ok(MemoryCollection::from_defs(batch_apply(&dfs, read)))
}

/// Reads the lines of `chunk.path` that belong to the byte range described by `chunk`.
///
/// A line belongs to a chunk when it begins at a byte position `p` with
/// `chunk.start < p <= chunk.end`, or at position 0 for the chunk that starts
/// at 0. Consequently:
///
/// * a chunk with `start > 0` discards everything up to and including the
///   first newline at or after `start`, since that text is the tail of (or an
///   entire) line that began at or before `start`;
/// * the last line of a chunk may extend past `chunk.end`.
///
/// Returned lines keep their trailing newline, as produced by `read_line`.
/// `_idx` is unused.
///
/// # Panics
///
/// Panics if the file cannot be opened, the seek fails, or the initial
/// partial line cannot be read.
fn read(_idx: usize, chunk: &Chunk) -> Vec<String> {
    let f = File::open(&chunk.path)
        .expect("Error when opening file");
    let mut reader = BufReader::new(f);
    reader.seek(SeekFrom::Start(chunk.start))
        .expect("Error when reading file!");

    // `pos` tracks the absolute byte offset of the next unread byte.
    let mut pos = if chunk.start > 0 {
        // Skip the first line, which is likely a partial line. Raw bytes are
        // read here (not a `String`) because the seek may have landed in the
        // middle of a multi-byte character.
        let mut skipped = Vec::new();
        let size = reader.read_until(b'\n', &mut skipped)
            .expect("Error reading line from file!");
        chunk.start + size as u64
    } else {
        0
    };

    let mut lines = Vec::new();
    // The bound is checked *before* each read: if the skipped partial line
    // already carried us past `chunk.end`, the next line starts in a later
    // chunk's range and reading it here would duplicate it.
    while pos <= chunk.end {
        let mut s = String::new();
        match reader.read_line(&mut s) {
            // End of file, or a read error (e.g. invalid UTF-8): stop and
            // return what has been collected so far.
            Ok(0) | Err(_) => break,
            Ok(size) => {
                pos += size as u64;
                s.shrink_to_fit();
                lines.push(s);
            }
        }
    }
    lines.shrink_to_fit();
    lines
}