1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
// This file is a part of the mori - Material Orientation Library in Rust
// Copyright 2018 Robert Carson
//
//    Licensed under the Apache License, Version 2.0 (the "License");
//    you may not use this file except in compliance with the License.
//    You may obtain a copy of the License at
//
//        http://www.apache.org/licenses/LICENSE-2.0
//
//    Unless required by applicable law or agreed to in writing, software
//    distributed under the License is distributed on an "AS IS" BASIS,
//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//    See the License for the specific language governing permissions and
//    limitations under the License.

use bytecount;
use lexical;
use memchr::memchr2_iter;
use std::fs::File;
use std::io::{BufRead, BufReader, Read, SeekFrom};
use std::str;
use std::str::FromStr;
use std::vec::*;

use failure::Error;
// use failure::err_msg;

// `macro_src` is declared before the other submodules and carries `#[macro_use]`,
// which keeps any macros it defines in scope for the modules declared after it.
// It is hidden from the generated documentation.
#[macro_use]
#[doc(hidden)]
mod macro_src;

// Public submodules of the reader.
pub mod float_reader;
pub mod int_reader;
pub mod parser;
pub mod prim_reader;
pub mod uint_reader;

// Re-exports so the submodule items can be reached directly through this module.
// Only `parse_txt` is re-exported from `parser`; the other modules are glob re-exported.
pub use self::float_reader::*;
pub use self::int_reader::*;
pub use self::macro_src::*;
pub use self::parser::parse_txt;
pub use self::prim_reader::*;
pub use self::uint_reader::*;

//Size in bytes (32 KiB) of the buffers used when reading a file.
//This value is similar to the buffer size found in BurntSushi's CSV crate; ours is just 4x as large.
const BUF_SIZE: usize = 32 * 1024;
///The type of delimiter that we can use
///
/// WhiteSpace - selects white space as the delimiter
///
/// Any(u8) - carries the single byte that is used as the delimiter
///
///The type is a small plain value, so it can be copied, compared, and debug printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Delimiter {
    WhiteSpace,
    Any(u8),
}

///ReaderParams tells us what our reader should be doing.
///
/// comments - an optional byte that marks a comment line. The `Default` implementation
///     sets it to `#`.
///
/// delimiter - the delimiter that tells us what our data fields are separated by
///
/// skip_header - an optional field that tells us whether or not we should skip so many lines that are not
///     comment lines from the beginning of the file
///
/// skip_footer - an optional field that tells us whether or not we should skip so many lines that are not
///     comment lines from the end of the file
///
/// usecols - an optional field that tells us what column numbers we should be using from the data field
///     where these values should be >= 1. We don't use 0 indexing for these values.
///
/// max_rows - an optional field that tells us the maximum number of rows we should use from the file
pub struct ReaderParams {
    pub comments: Option<u8>,
    pub delimiter: Delimiter,
    pub skip_header: Option<usize>,
    pub skip_footer: Option<usize>,
    pub usecols: Option<Vec<usize>>,
    pub max_rows: Option<usize>,
}

///The default reader settings: `#` marks a comment, fields are separated by white space,
///and every other option is left unset.
///
///The defaults can be used as they are or overridden field by field with the struct
///update syntax:
///
///```ignore
///let params = ReaderParams::default();
///
///let params = ReaderParams {
///    comments: Some(b'%'),
///    ..Default::default()
///};
///```
impl Default for ReaderParams {
    fn default() -> Self {
        Self {
            comments: Some(b'#'),
            delimiter: Delimiter::WhiteSpace,
            usecols: None,
            max_rows: None,
            skip_header: None,
            skip_footer: None,
        }
    }
}

///A structure that contains all of the results of reading a file.
///
/// num_fields - the number of fields we had
///
/// num_lines - the number of lines that we read
///
/// results - all of the values stored in a single Vec of type T. Type T is what type one
///     called load_txt_* for.
///
///NOTE(review): the order in which values are laid out inside `results` is not visible from this
///definition - presumably one line after another; confirm against the reader implementation.
#[derive(Debug, Clone)]
pub struct ReaderResults<T: FromStr> {
    pub num_fields: usize,
    pub num_lines: usize,
    pub results: Vec<T>,
}

///A structure that contains all of the raw results of reading a file.
///
/// num_fields - the number of fields we had
///
/// num_lines - the number of lines that we read
///
/// results - all of the data that was read in from the file in its raw u8 format
///
/// index - the starting index for each field that was read in
///
///Like `ReaderResults`, it can be cloned and debug printed.
#[derive(Debug, Clone)]
pub struct RawReaderResults {
    pub num_fields: usize,
    pub num_lines: usize,
    pub results: Vec<u8>,
    pub index: Vec<usize>,
}

///A private helper that reports how many times the byte `eol` occurs in `buf`.
///The result only equals a number of lines under the assumption that `eol` appears once per line.
fn count_lines(buf: &[u8], eol: u8) -> usize {
    //bytecount already hands back a usize, so no conversion is needed.
    bytecount::count(buf, eol)
}

///It simply counts all of the lines in the file when an end of line is denoted by \n.
///It does not take into account whether any line is a comment or not. Only \n bytes are
///counted, so a final line that is not terminated by \n is not part of the total.
///
/// f - the file to read. Reading starts at the file's current position and continues until
///     the end of the file is reached.
///
///Returns the number of \n bytes that were read.
///
///Panics if reading from the file fails with anything other than an interrupted read.
pub fn read_num_file_tot_lines(f: &mut File) -> usize {
    let mut buffer = vec![0u8; BUF_SIZE];
    let mut count = 0;

    loop {
        let length = match f.read(&mut buffer) {
            //A read of zero bytes is the only reliable end of file signal. A read is allowed to
            //return fewer bytes than the buffer holds before the end of the file is reached, so
            //a short read must not stop the count.
            Ok(0) => break,
            Ok(length) => length,
            //An interrupted read did not transfer anything and can simply be tried again.
            Err(ref err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => panic!("failed to read from the file: {}", err),
        };
        //Only the part of the buffer that was filled by this read is inspected.
        count += count_lines(&buffer[..length], b'\n');
    }

    count
}

///It counts the lines in the file that hold data, when an end of line is denoted by \n or \r.
///
///A line is counted when its first byte that is not a space or a tab is neither an end of line
///byte nor the comment byte. Blank lines, lines made up of only spaces and tabs, and lines whose
///first significant byte is the comment byte are therefore not counted in the total. A final line
///that is not terminated by an end of line byte is still counted.
///
/// f - the file to read. Reading starts at the file's current position.
///
/// com - the byte that marks a line as a comment
///
///Returns the number of lines that were counted.
///
///Panics if reading from the file fails with anything other than an interrupted read.
pub fn read_num_file_lines(f: &File, com: u8) -> usize {
    let mut count = 0;
    //We're explicitly using the raw bytes here
    let mut reader = BufReader::with_capacity(BUF_SIZE, f);
    //A single line can be spread over two refills of the buffer. This flag records that the
    //current line has already been classified and that everything up to its end of line byte
    //still has to be skipped. Without it the tail of a long line would be mistaken for the start
    //of a new line and could be counted a second time.
    let mut skip_to_eol = false;
    //We loop over until the file has been completely read
    loop {
        //We first find the length of our buffer
        let length = {
            //We fill the buffer up. The borrow of the reader is why this lives in its own block
            let buffer = match reader.fill_buf() {
                Ok(buffer) => buffer,
                //An interrupted read did not transfer anything and can simply be tried again.
                Err(ref err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(err) => panic!("failed to read from the file: {}", err),
            };
            let length = buffer.len();
            //An empty buffer is the only reliable end of file signal. A refill is allowed to hand
            //back fewer than BUF_SIZE bytes before the end of the file has been reached.
            if length == 0 {
                break;
            }
            //We're using the memchr crate to locate all of the most common newline characters
            //It provides a nice iterator over our buffer. The loop below keeps it in step with
            //our index: its next item is always the first end of line byte at or after `i`.
            let mut newline = memchr2_iter(b'\n', b'\r', buffer);
            let mut i = 0;
            //Finish skipping a line that started in an earlier buffer before looking for new ones
            if skip_to_eol {
                match newline.next() {
                    Some(eol) => {
                        skip_to_eol = false;
                        i = eol + 1;
                    }
                    //The line is longer than this whole buffer, so we keep skipping
                    None => i = length,
                }
            }
            //An explicit index loop is used since we need to jump to a location of our choosing
            while i < length {
                let byte = buffer[i];
                //Leading spaces and tabs never decide what kind of line we are looking at
                if byte == b' ' || byte == b'\t' {
                    i += 1;
                    continue;
                }
                //The first significant byte classifies the line. An end of line byte means the
                //line was blank, the comment byte means it is a comment, and anything else
                //means the line holds data and gets counted.
                if byte != b'\n' && byte != b'\r' && byte != com {
                    count += 1;
                }
                //Either way the rest of the line is skipped by jumping just past its end of line
                //byte. For a blank line that byte is the one we are currently looking at.
                match newline.next() {
                    Some(eol) => i = eol + 1,
                    None => {
                        //The line runs past the end of this buffer, so remember to skip its tail
                        skip_to_eol = true;
                        i = length;
                    }
                }
            }
            //Pass off our length to set our length outside of this block of code
            length
        };
        //We now need to consume everything in our buffer, so it's marked off as no longer being needed
        reader.consume(length);
    }
    //Finally, we return our line count to the main code.
    count
}