use std::fs::File;
use std::io::{BufRead, BufReader, Error};
use std::ops::Range;
use std::path::Path;
/// Item yielded by the row iterator: a parsed `Record`, or the I/O error
/// returned while reading the underlying line.
type IterItem = Result<Record, Error>;
/// A delimiter-separated text file opened for line-by-line reading.
///
/// A `Tabfile` is created with [`Tabfile::open`] and then configured through
/// its consuming builder methods. The defaults are: tab as separator, `#` as
/// comment character, no leading lines skipped, and empty lines skipped.
pub struct Tabfile {
    /// Buffered handle on the opened file.
    reader: BufReader<File>,
    /// Character that delimits the fields of a line.
    separator: char,
    /// Lines starting with this character are ignored when it is `Some`.
    comment_character: Option<char>,
    /// Number of leading lines that are still to be discarded.
    skip_lines: usize,
    /// Whether lines containing only whitespace are ignored.
    skip_empty_lines: bool,
}

impl Tabfile {
    /// Opens the file at `path` with the default settings.
    ///
    /// # Errors
    ///
    /// Returns the error reported by [`File::open`] if the file cannot be
    /// opened.
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Tabfile, Error> {
        File::open(path).map(|file| Tabfile {
            reader: BufReader::new(file),
            separator: '\t',
            comment_character: Some('#'),
            skip_lines: 0,
            skip_empty_lines: true,
        })
    }

    /// Sets the character that separates fields within a line.
    pub fn separator(self, sep: char) -> Self {
        Tabfile {
            separator: sep,
            ..self
        }
    }

    /// Sets how many lines at the start of the file are discarded.
    pub fn skip_lines(self, num_lines: usize) -> Self {
        Tabfile {
            skip_lines: num_lines,
            ..self
        }
    }

    /// Sets the character that marks a line as a comment when it is the
    /// first character of that line.
    pub fn comment_character(self, comment_character: char) -> Self {
        Tabfile {
            comment_character: Some(comment_character),
            ..self
        }
    }

    /// Chooses whether lines consisting only of whitespace are ignored.
    pub fn skip_empty_lines(self, skip: bool) -> Self {
        Tabfile {
            skip_empty_lines: skip,
            ..self
        }
    }
}
/// Consuming a `Tabfile` iterates over its records.
impl IntoIterator for Tabfile {
    type IntoIter = RowIterator;
    type Item = IterItem;

    /// Hands the configured file over to a `RowIterator`.
    fn into_iter(self) -> RowIterator {
        RowIterator::new(self)
    }
}
/// Iterator over the records of a `Tabfile`.
pub struct RowIterator {
    /// The file being read, together with its parsing options.
    tabfile: Tabfile,
    /// Running line counter. It is incremented before a record is built, so
    /// the first line of the file is reported as line 1.
    next_line_number: usize,
}

impl RowIterator {
    /// Creates an iterator over `tabfile` with the line counter at zero.
    fn new(tabfile: Tabfile) -> RowIterator {
        Self {
            next_line_number: 0,
            tabfile,
        }
    }
}
/// Yields one `Record` per data line of the underlying file.
impl Iterator for RowIterator {
    type Item = IterItem;

    /// Reads lines until one of them is a data line and returns it as a
    /// record.
    ///
    /// Lines are dropped, in this order of precedence, when they
    /// * belong to the leading `skip_lines` lines of the file,
    /// * start with the configured comment character, or
    /// * contain only whitespace and `skip_empty_lines` is enabled.
    ///
    /// Dropped lines still count towards the line number reported by the
    /// record. Returns `None` at end of file and `Some(Err(_))` when reading
    /// a line fails.
    fn next(&mut self) -> Option<Self::Item> {
        // One buffer serves all skipped lines; it is only moved out when a
        // record is actually returned.
        let mut line = String::new();
        loop {
            line.clear();
            match self.tabfile.reader.read_line(&mut line) {
                Err(e) => return Some(Err(e)),
                // `read_line` reports end of file as zero bytes read. This is
                // checked before the skip counter so that a `skip_lines`
                // larger than the file does not keep polling the reader.
                Ok(0) => return None,
                Ok(_) => {}
            }

            // Every physical line counts, whether it is returned or not.
            self.next_line_number += 1;

            if self.tabfile.skip_lines > 0 {
                self.tabfile.skip_lines -= 1;
                continue;
            }

            let is_comment = self
                .tabfile
                .comment_character
                .map_or(false, |comment_char| line.starts_with(comment_char));
            if is_comment {
                continue;
            }

            if self.tabfile.skip_empty_lines && line.trim().is_empty() {
                continue;
            }

            return Some(Ok(Record::new(
                line,
                self.next_line_number,
                self.tabfile.separator,
            )));
        }
    }
}
/// One line of a delimited file, split into fields.
///
/// The record owns the raw line and stores the byte range of every field, so
/// fields are handed out as slices of the line without copying.
#[derive(Debug, Clone)]
pub struct Record {
    /// 1-based number of the line within the file.
    line_number: usize,
    /// The raw line as it was read, including any line terminator.
    line: String,
    /// Byte ranges of the individual fields within `line`.
    ranges: Vec<Range<usize>>,
}

impl Record {
    /// Splits `line` at every occurrence of `separator`.
    ///
    /// A trailing line terminator (`"\n"` or `"\r\n"`) is not part of the
    /// last field. Carriage returns elsewhere in the line are ordinary field
    /// content; they do not end the record.
    fn new(line: String, line_number: usize, separator: char) -> Record {
        // Only the end of the line is a terminator; everything before it is
        // data that must be split into fields.
        let content = line.trim_end_matches(|c| c == '\n' || c == '\r');

        let mut ranges = Vec::new();
        let mut field_start = 0;
        // `char_indices` yields byte offsets, so the ranges stay valid for
        // multi-byte UTF-8 content and separators.
        for (offset, c) in content.char_indices() {
            if c == separator {
                ranges.push(field_start..offset);
                field_start = offset + c.len_utf8();
            }
        }
        // The last field runs to the end of the content; it is empty when the
        // line ends with a separator.
        ranges.push(field_start..content.len());

        Record {
            line,
            line_number,
            ranges,
        }
    }

    /// Returns the fields of this record, in order, as slices of the line.
    pub fn fields(&self) -> Vec<&str> {
        self.ranges
            .iter()
            .map(|range| &self.line[range.clone()])
            .collect()
    }

    /// Returns the raw line, including its line terminator if it had one.
    pub fn line(&self) -> &str {
        &self.line
    }

    /// Returns the 1-based line number of this record.
    pub fn line_number(&self) -> usize {
        self.line_number
    }

    /// Returns the number of fields in this record.
    pub fn len(&self) -> usize {
        self.ranges.len()
    }

    /// Returns `true` if the record has no fields.
    ///
    /// A record built by `Record::new` always has at least one (possibly
    /// empty) field, so this exists mainly to accompany [`Record::len`].
    pub fn is_empty(&self) -> bool {
        self.ranges.is_empty()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use std::path::PathBuf;
    use tempfile::{tempdir, TempDir};

    // Two junk lines, blank lines, a comment line and a final line without a
    // trailing newline.
    const FOUR_COLUMN: &[u8] = b"line noise\n\nfoo\tbar\tbaz\tquux\nalpha\tbeta\tgamma\tdelta\n\nLeonardo\tMichelangelo\tDonatello\tRaphael\n#please ignore me\nred\tyellow\tgreen";
    // Multi-byte UTF-8 content in every field.
    const UNICODE: &[u8] =
        "ä line with Ünicöde symböls\tmøre wørds tø ræd\néverything îs strànge\t💣ℝ is it?\n"
            .as_bytes();
    // Leading, trailing and consecutive separators.
    const EMPTY: &[u8] = b"\t\t\tleft\t\t\tright\t\t\t";
    // Windows-style line endings.
    const CRLF: &[u8] = b"a\tb\r\nc\td\r\n";

    /// Writes `test_file_contents` to a file inside a fresh temporary
    /// directory and returns the directory guard together with the file path.
    /// The guard must be kept alive for as long as the file is needed.
    fn setup(test_file_contents: &[u8]) -> (TempDir, PathBuf) {
        let test_dir = tempdir().unwrap();
        let test_file_path = test_dir.path().join("four_column.tsv");
        let mut test_file = File::create(&test_file_path).unwrap();
        // `write` may stop after a partial write; `write_all` guarantees the
        // whole fixture ends up in the file.
        test_file.write_all(test_file_contents).unwrap();
        (test_dir, test_file_path)
    }

    #[test]
    fn four_column() {
        let (_test_dir, test_file_path) = setup(FOUR_COLUMN);
        let tabfile = Tabfile::open(test_file_path)
            .unwrap()
            .comment_character('#')
            .skip_lines(2)
            .separator('\t');
        let mut iterations = 0;
        for (i, line) in tabfile.into_iter().enumerate() {
            iterations += 1;
            let record = line.unwrap();
            let fields = record.fields();
            match i {
                0 => {
                    assert_eq!(record.line_number(), 3);
                    assert_eq!(fields[0], "foo");
                    assert_eq!(fields[1], "bar");
                    assert_eq!(fields[2], "baz");
                    assert_eq!(fields[3], "quux");
                    assert_eq!(record.line(), "foo\tbar\tbaz\tquux\n");
                    assert_eq!(record.len(), 4);
                }
                1 => {
                    assert_eq!(record.line_number(), 4);
                    assert_eq!(fields[0], "alpha");
                    assert_eq!(fields[1], "beta");
                    assert_eq!(fields[2], "gamma");
                    assert_eq!(fields[3], "delta");
                    assert_eq!(record.len(), 4);
                }
                2 => {
                    assert_eq!(record.line_number(), 6);
                    assert_eq!(fields[0], "Leonardo");
                    assert_eq!(fields[1], "Michelangelo");
                    assert_eq!(fields[2], "Donatello");
                    assert_eq!(fields[3], "Raphael");
                    assert_eq!(record.len(), 4);
                }
                3 => {
                    assert_eq!(record.line_number(), 8);
                    assert_eq!(fields[0], "red");
                    assert_eq!(fields[1], "yellow");
                    assert_eq!(fields[2], "green");
                    assert_eq!(record.line(), "red\tyellow\tgreen");
                    assert_eq!(record.len(), 3);
                }
                _ => panic!("unexpected record at index {}", i),
            }
        }
        assert_eq!(iterations, 4);
    }

    #[test]
    fn unicode() {
        let (_test_dir, test_file_path) = setup(UNICODE);
        let tabfile = Tabfile::open(test_file_path).unwrap();
        let mut iterations = 0;
        for (i, line) in tabfile.into_iter().enumerate() {
            iterations += 1;
            let record = line.unwrap();
            let fields = record.fields();
            match i {
                0 => {
                    assert_eq!(record.line_number(), 1);
                    assert_eq!(fields[0], "ä line with Ünicöde symböls");
                    assert_eq!(fields[1], "møre wørds tø ræd");
                    assert_eq!(record.len(), 2);
                }
                1 => {
                    assert_eq!(record.line_number(), 2);
                    assert_eq!(fields[0], "éverything îs strànge");
                    assert_eq!(fields[1], "💣ℝ is it?");
                    assert_eq!(record.len(), 2);
                }
                _ => panic!("unexpected record at index {}", i),
            }
        }
        assert_eq!(iterations, 2);
    }

    #[test]
    fn empty() {
        let (_test_dir, test_file_path) = setup(EMPTY);
        let tabfile = Tabfile::open(test_file_path)
            .unwrap()
            .skip_empty_lines(false);
        let mut iterations = 0;
        for (i, line) in tabfile.into_iter().enumerate() {
            iterations += 1;
            let record = line.unwrap();
            let fields = record.fields();
            match i {
                0 => {
                    assert_eq!(record.line_number(), 1);
                    assert_eq!(fields[0], "");
                    assert_eq!(fields[1], "");
                    assert_eq!(fields[2], "");
                    assert_eq!(fields[3], "left");
                    assert_eq!(fields[4], "");
                    assert_eq!(fields[5], "");
                    assert_eq!(fields[6], "right");
                    assert_eq!(fields[7], "");
                    assert_eq!(fields[8], "");
                    assert_eq!(fields[9], "");
                    assert_eq!(record.len(), 10);
                }
                _ => panic!("unexpected record at index {}", i),
            }
        }
        assert_eq!(iterations, 1);
    }

    #[test]
    fn crlf_line_endings() {
        let (_test_dir, test_file_path) = setup(CRLF);
        let tabfile = Tabfile::open(test_file_path).unwrap();
        let records: Vec<Record> = tabfile.into_iter().map(|r| r.unwrap()).collect();
        assert_eq!(records.len(), 2);
        assert_eq!(records[0].fields(), vec!["a", "b"]);
        assert_eq!(records[0].line(), "a\tb\r\n");
        assert_eq!(records[1].fields(), vec!["c", "d"]);
        assert_eq!(records[1].line_number(), 2);
    }

    #[test]
    fn skip_lines_beyond_end_of_file() {
        let (_test_dir, test_file_path) = setup(FOUR_COLUMN);
        let tabfile = Tabfile::open(test_file_path).unwrap().skip_lines(100);
        assert_eq!(tabfile.into_iter().count(), 0);
    }
}