use flate2::read;
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fs::File;
use std::io::{self, BufRead, BufReader, ErrorKind};
use std::path::Path;
use crate::{DataMatrix, Error};
/// Configurable builder that assembles a `DataMatrix` either from an
/// in-memory slice (`from_data`) or from a delimited text file (`from_file`).
///
/// Column numbers are 0-based positions of a field within one record of the
/// input file.
#[derive(Debug, Clone)]
pub struct DataMatrixBuilder {
/// Column holding the row label of a record (`new()` sets it to 0).
row_label_col: usize,
/// Column holding the column label of a record (`new()` sets it to 1).
col_label_col: usize,
/// Column holding the numeric value of a record (`new()` sets it to 2).
data_col: usize,
/// Optional column with an explicit row index; only used by `from_file`
/// when `col_idx_col` is set as well.
row_idx_col: Option<usize>,
/// Optional column with an explicit column index; only used by `from_file`
/// when `row_idx_col` is set as well.
col_idx_col: Option<usize>,
/// Field separator; `None` lets `from_file` guess it from the file extension.
separator: Option<char>,
/// When true, `from_file` stores every value at both `[i][j]` and `[j][i]`
/// and uses one shared label set for rows and columns.
symmetric: bool,
/// When true, the first line that is neither blank nor a `#` comment is skipped.
skip_header: bool,
/// Preset labels used for both rows and columns; when present, `from_file`
/// reads a single column of values instead of labelled records.
labels: Option<Vec<String>>,
}
#[allow(clippy::new_without_default)]
impl DataMatrixBuilder {
pub fn new() -> Self {
Self {
row_label_col: 0,
col_label_col: 1,
data_col: 2,
row_idx_col: None,
col_idx_col: None,
separator: None,
symmetric: false,
skip_header: false,
labels: None,
}
}
pub fn label_columns(mut self, row: usize, col: usize) -> Self {
self.row_label_col = row;
self.col_label_col = col;
self
}
pub fn labels<I, S>(mut self, labels: I) -> Self
where
I: IntoIterator<Item = S>,
S: Into<String>,
{
self.labels = Some(labels.into_iter().map(Into::into).collect());
self
}
pub fn data_column(mut self, val: usize) -> Self {
self.data_col = val;
self
}
pub fn index_columns(mut self, row_idx: usize, col_idx: usize) -> Self {
self.row_idx_col = Some(row_idx);
self.col_idx_col = Some(col_idx);
self
}
pub fn separator(mut self, sep: char) -> Self {
self.separator = Some(sep);
self
}
pub fn skip_header(mut self, if_header: bool) -> Self {
self.skip_header = if_header;
self
}
pub fn symmetric(mut self, if_symmetric: bool) -> Self {
self.symmetric = if_symmetric;
self
}
pub fn from_data(self, data: &[f64]) -> Result<DataMatrix, Error> {
let len = data.len();
let n = (len as f64).sqrt() as usize;
if n * n != len {
return Err(Error::WrongNumberOfData { n_data: len });
}
let (row_labels, col_labels) = match &self.labels {
Some(given) => (given.clone(), given.clone()),
None => {
let rows = (0..n).map(|i| format!("row-{}", i + 1)).collect();
let cols = (0..n).map(|i| format!("col-{}", i + 1)).collect();
(rows, cols)
}
};
let mut matrix = Vec::with_capacity(n);
for i in 0..n {
let start = i * n;
let end = start + n;
matrix.push(data[start..end].to_vec());
}
DataMatrix::new(matrix, row_labels, col_labels)
}
pub fn from_file<P: AsRef<Path>>(self, filename: P) -> Result<DataMatrix, Error> {
if let Some(ref labels) = self.labels {
return self.read_one_column(filename, self.data_col, labels.clone());
}
let mut row_indexer = Indexer::new();
let mut col_indexer = Indexer::new();
let separator = match self.separator {
None => guess_separator(&filename),
Some(c) => c,
};
let lines = parse_plain(filename, separator, self.skip_header)?;
if let (Some(r_idx), Some(c_idx)) = (self.row_idx_col, self.col_idx_col) {
for (line_no, parts) in lines.iter().enumerate() {
let row_idx: usize = parts[r_idx].parse().map_err(|_| Error::ParseError {
line: line_no,
content: parts[r_idx].to_string(),
})?;
let col_idx: usize = parts[c_idx].parse().map_err(|_| Error::ParseError {
line: line_no,
content: parts[c_idx].to_string(),
})?;
row_indexer.add_explicit(&parts[self.row_label_col], row_idx);
if self.symmetric {
row_indexer.add_explicit(&parts[self.col_label_col], col_idx);
} else {
col_indexer.add_explicit(&parts[self.col_label_col], col_idx);
}
}
} else {
for parts in &lines {
row_indexer.add(&parts[self.row_label_col]);
if self.symmetric {
row_indexer.add(&parts[self.col_label_col]);
} else {
col_indexer.add(&parts[self.col_label_col]);
}
}
}
if self.symmetric {
col_indexer = row_indexer.clone();
}
let mut data = vec![vec![0.0; col_indexer.max_index()]; row_indexer.max_index()];
let row_labels = row_indexer.to_vec();
let col_labels = col_indexer.to_vec();
for (line_no, parts) in lines.into_iter().enumerate() {
let i_row = row_indexer.index(&parts[self.row_label_col]);
let j_col = col_indexer.index(&parts[self.col_label_col]);
let value: f64 = parts[self.data_col]
.parse()
.map_err(|_| Error::ParseError {
line: line_no,
content: parts[self.data_col].to_string()
})?;
data[i_row][j_col] = value;
if self.symmetric {
data[j_col][i_row] = value;
}
}
DataMatrix::new(data, row_labels, col_labels)
}
fn read_one_column<P: AsRef<Path>>(
&self,
filename: P,
column: usize,
labels: Vec<String>,
) -> Result<DataMatrix, Error> {
let rows = parse_plain(filename, ' ', self.skip_header)?;
let col_idx = column;
let mut values = Vec::new();
for (line_num, parts) in rows.into_iter().enumerate() {
if col_idx >= parts.len() {
return Err(Error::NotEnoughColumns {
line: line_num + 1,
needed: col_idx + 1,
content: format!("{:?}", parts),
});
}
let value: f64 = parts[col_idx].parse().map_err(|_| Error::ParseError {
line: line_num + 1,
content: parts[col_idx].clone(),
})?;
values.push(value);
}
let n = labels.len();
if n * n != values.len() {
return Err(Error::ParseError {
line: 0,
content: format!(
"Expected {}² = {} values, but found {}",
n,
n * n,
values.len()
),
});
}
let mut data = Vec::with_capacity(n);
for i in 0..n {
let start = i * n;
let end = start + n;
data.push(values[start..end].to_vec());
}
DataMatrix::new(data, labels.clone(), labels)
}
}
/// Reads `filename` and splits every record into its fields.
///
/// Lines that are blank or start with `#` are dropped. When `skip_header`
/// is set, the first remaining line is dropped as well. A `separator` of
/// `' '` splits on any run of whitespace; every other separator splits on
/// exactly that character, so empty fields are preserved.
///
/// # Errors
///
/// Propagates I/O errors from opening or reading the file.
fn parse_plain<P: AsRef<Path>>(
    filename: P,
    separator: char,
    skip_header: bool,
) -> std::io::Result<Vec<Vec<String>>> {
    let mut header_pending = skip_header;
    let mut records = Vec::new();

    for raw in open_file(filename)?.lines() {
        let raw = raw?;
        let is_blank = raw.trim().is_empty();
        if is_blank || raw.starts_with('#') {
            continue;
        }
        if header_pending {
            header_pending = false;
            continue;
        }
        let fields: Vec<String> = match separator {
            ' ' => raw.split_whitespace().map(str::to_owned).collect(),
            sep => raw.split(sep).map(str::to_owned).collect(),
        };
        records.push(fields);
    }

    Ok(records)
}
/// Maps string labels to matrix indexes.
///
/// Indexes are either assigned sequentially in order of first appearance
/// (`add`) or supplied by the caller (`add_explicit`). Explicit indexes may
/// be sparse or start above zero, so the size of the index space is derived
/// from the highest stored index rather than from the number of labels.
#[derive(Clone)]
struct Indexer {
    label_to_index: HashMap<String, usize>,
}

impl Indexer {
    /// Creates an empty indexer.
    fn new() -> Self {
        Self {
            label_to_index: HashMap::new(),
        }
    }

    /// Returns the index of `label`, registering it under the next
    /// sequential index (the current number of labels) if it is new.
    fn add(&mut self, label: &str) -> usize {
        if let Some(&idx) = self.label_to_index.get(label) {
            return idx;
        }
        let idx = self.label_to_index.len();
        self.label_to_index.insert(label.to_string(), idx);
        idx
    }

    /// Registers `label` under the caller-supplied `idx`; a label that is
    /// already known keeps its first index.
    fn add_explicit(&mut self, label: &str, idx: usize) {
        self.label_to_index.entry(label.to_string()).or_insert(idx);
    }

    /// Returns the index registered for `label`.
    ///
    /// # Panics
    ///
    /// Panics when `label` was never added.
    fn index(&self, label: &str) -> usize {
        *self
            .label_to_index
            .get(label)
            .expect("Label not found in indexer")
    }

    /// Returns the size of the index space: one more than the highest
    /// stored index, or 0 when no label is registered.
    ///
    /// For purely sequential indexes this equals the number of labels.
    fn max_index(&self) -> usize {
        self.label_to_index
            .values()
            .max()
            .map_or(0, |&highest| highest + 1)
    }

    /// Returns the labels ordered by index. Positions that no label occupies
    /// (possible with sparse explicit indexes) hold an empty string.
    fn to_vec(&self) -> Vec<String> {
        // Sized by the highest index, not the label count, so that explicit
        // indexes beyond `len()` cannot write out of bounds.
        let mut result = vec![String::new(); self.max_index()];
        for (label, &idx) in &self.label_to_index {
            result[idx] = label.clone();
        }
        result
    }
}
/// Guesses the field separator of a data file from its (case-insensitive)
/// extension.
///
/// A trailing compression extension (`gz`, `bz2`, `xz`, `zst`, `zip`) is
/// looked through, so `table.csv.gz` is treated like `table.csv`. Unknown
/// or missing extensions fall back to a space.
fn guess_separator<P: AsRef<Path>>(path: P) -> char {
    /// Known data extensions and the separator each one implies.
    const SEPARATORS: [(&str, char); 6] = [
        ("dat", ' '),
        ("csv", ','),
        ("tsv", '\t'),
        ("tab", '\t'),
        ("psv", '|'),
        ("ssv", ';'),
    ];

    /// Lower-cased extension of `p`, if it has one that is valid UTF-8.
    fn lowercase_ext(p: &Path) -> Option<String> {
        p.extension()
            .and_then(|e| e.to_str())
            .map(|e| e.to_ascii_lowercase())
    }

    let path = path.as_ref();
    let mut ext = lowercase_ext(path).unwrap_or_default();
    if matches!(ext.as_str(), "gz" | "bz2" | "xz" | "zst" | "zip") {
        // Compressed file: the data format is named by the extension of the stem.
        ext = path
            .file_stem()
            .and_then(|stem| lowercase_ext(Path::new(stem)))
            .unwrap_or_default();
    }

    SEPARATORS
        .iter()
        .find(|(known, _)| *known == ext.as_str())
        .map_or(' ', |&(_, sep)| sep)
}
/// Opens `file_path` for buffered line-oriented reading.
///
/// A file whose extension is `gz` (compared case-insensitively, matching
/// how `guess_separator` treats extensions) is decompressed on the fly
/// with a gzip decoder; any other file is read as is.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidInput` for an empty path and propagates the
/// error of `File::open` otherwise.
fn open_file<P: AsRef<Path>>(file_path: P) -> io::Result<Box<dyn BufRead>> {
    /// Read-buffer capacity shared by the plain and the gzip reader.
    const BUFFER_SIZE: usize = 128 * 1024;

    let path = file_path.as_ref();
    if path.as_os_str().is_empty() {
        return Err(io::Error::new(
            ErrorKind::InvalidInput,
            "Couldn't open file: empty path",
        ));
    }
    let file = File::open(path)?;

    let is_gzip = path
        .extension()
        .map_or(false, |ext| ext.eq_ignore_ascii_case(OsStr::new("gz")));
    if is_gzip {
        Ok(Box::new(BufReader::with_capacity(
            BUFFER_SIZE,
            read::GzDecoder::new(file),
        )))
    } else {
        Ok(Box::new(BufReader::with_capacity(BUFFER_SIZE, file)))
    }
}