use super::histogram::Dataset;
use super::histogram::Histogram;
use super::Config;
use super::correlation_analysis::{statistical_ineff, autocorrelation_time};
use std::fs::OpenOptions;
use std::fs::File;
use std::io::prelude::*;
use std::io::{BufReader,BufWriter};
use k_B;
use std::path::Path;
use super::errors::*;
use f64;
fn get_relative_path(path1: &str, path2: &str) -> String {
let path1 = Path::new(path1);
path1.parent().unwrap().join(path2).to_str().unwrap().to_string()
}
pub fn vprintln(s: String, verbose: bool) {
if verbose {
println!("{}", s);
}
}
pub fn read_data(cfg: &Config) -> Result<Vec<Dataset>> {
let mut bias_pos: Vec<f64> = Vec::new();
let mut bias_fc: Vec<f64> = Vec::new();
let mut timeseries_lengths: Vec<usize> = Vec::new();
let mut paths = Vec::new();
let dataset_boundaries: Vec<(f64, f64)> = get_convdt_boundaries(cfg.start, cfg.end, cfg.convdt);
let num_datasets = dataset_boundaries.len();
let mut histograms = vec![Vec::new(); dataset_boundaries.len()];
let kT = cfg.temperature * k_B;
let bin_width: Vec<f64> = (0..cfg.dimens).map(|idx| {
(cfg.hist_max[idx] - cfg.hist_min[idx])/(cfg.num_bins[idx] as f64)
}).collect();
let num_bins: usize = cfg.num_bins.iter().product();
let dimens_length = cfg.num_bins.clone();
let f = File::open(&cfg.metadata_file).chain_err(|| "Failed to open metadata file")?;
let buf = BufReader::new(&f);
for (line_num,l) in buf.lines().enumerate() {
let line = l.chain_err(|| "Failed to read line")?;
if line.starts_with('#') || line.is_empty() {
continue;
}
let split: Vec<&str> = line.split_whitespace().collect();
if split.len() < 1 + cfg.dimens * 2 {
bail!(format!("Wrong number of columns in line {} of metadata file. Empty Line?", line_num+1));
}
for val in split.iter().skip(1).take(cfg.dimens) {
let pos = val.parse()
.chain_err(|| format!("Failed to read bias position in line {} of metadata file", line_num+1))?;
bias_pos.push(pos);
}
for val in split.iter().skip(1+cfg.dimens).take(cfg.dimens) {
let fc = val.parse()
.chain_err(|| format!("Failed to read bias fc in line {} of metadata file", line_num+1))?;
bias_fc.push(fc);
}
let path = get_relative_path(&cfg.metadata_file, split[0]);
paths.push(path.clone());
let (timeseries, timeseries_initial_lengths) = read_window_file(&path, cfg)
.chain_err(|| format!("Failed to read time series from {}", &path))?;
timeseries_lengths.push(timeseries_initial_lengths);
for (idx, interval) in dataset_boundaries.iter().enumerate() {
let (start, stop) = interval;
let timeseries_mask: Vec<bool> = (0..timeseries[0].len()).map(|i| {
is_in_time_boundaries(timeseries[0][i], *start, *stop)
}).collect();
let hist = build_histogram_from_timeseries(×eries, ×eries_mask, cfg);
histograms[idx].push(hist);
if (cfg.convdt == 0.00) || idx+1 == num_datasets {
vprintln(format!("{}, {} data points added.",
&path, histograms[idx].last().unwrap().num_points), cfg.verbose);
break
}
}
}
vprintln(format!("Generating {} datasets from histograms.", num_datasets), cfg.verbose);
let datasets: Vec<Dataset> = histograms.into_iter().enumerate().map(|(dataset_idx, dataset_histograms)| {
for (hs, path) in dataset_histograms.iter().zip(&paths) {
if hs.num_points == 0 {
let warning = format!("No data points for interval {}-{} in histogram boundaries: {}.",
dataset_boundaries[dataset_idx].0, dataset_boundaries[dataset_idx].1 ,&path);
if dataset_idx+1 == num_datasets {
let warning = warning + " This is the final dataset.";
if cfg.ignore_empty {
eprintln!("{}", warning);
} else {
bail!(warning);
}
} else {
eprintln!("{}", warning);
}
}
}
Ok(Dataset::new(num_bins, dimens_length.clone(), bin_width.clone(),
cfg.hist_min.clone(), cfg.hist_max.clone(), bias_pos.clone(),
bias_fc.clone(), kT, dataset_histograms, cfg.cyclic))
}).collect::<Result<Vec<Dataset>>>().chain_err(|| "Failed to create datasets.")?;
if datasets.is_empty() {
bail!("No datasets created.")
} else if datasets[0].histograms.is_empty() {
bail!("Dataset has no associated data points.")
} else {
if datasets.len() > 1 {
println!("Datasets:");
println!("Dataset\t\tTime interval\t\tWindows\t\tN_total");
for (idx, dataset) in datasets.iter().enumerate() {
let n: u32 = dataset.histograms.iter().map(|h| h.num_points).sum();
let mut stop = cfg.start+cfg.convdt*(idx+1) as f64;
if stop > cfg.end {
stop = cfg.end;
}
println!("{:?}\t\t{:?}-{:?}\t\t{:?}\t\t{:?}", idx+1, cfg.start, stop, dataset.histograms.len(), n);
}
}
let histograms = &datasets.last().unwrap().histograms;
if cfg.uncorr {
println!("Timeseries Correlation:");
println!("Window\t\tN\t\tN_uncorr\tN/N_uncorr");
for (idx, (n, h)) in timeseries_lengths.iter().zip(histograms.iter()).enumerate() {
println!("{:?}\t\t{:?}\t\t{:?}\t\t{:.2}",
idx+1, n, h.num_points, h.num_points as f64 / *n as f64);
}
let total_n = timeseries_lengths.iter().sum::<usize>() as f64;
let total_h = histograms.iter().map(|h| h.num_points).sum::<u32>() as f64;
println!("\t\t\t\t\tTotal:\t{:.2}", total_h/total_n);
}
Ok(datasets)
}
}
fn get_convdt_boundaries(start: f64, end: f64, convdt: f64) -> Vec<(f64, f64)> {
if convdt == 0.0 {
vec![(start, end)]
} else {
let intervals: usize = ((end - start) / convdt).ceil() as usize;
(1..intervals+1).map(|i| {
let interval_end = i as f64 * convdt + start;
if interval_end > end {
end
} else {
interval_end
}
}).map(|interval_end| { (start, interval_end) }).collect()
}
}
fn build_histogram_from_timeseries(timeseries: &[Vec<f64>], mask: &[bool],
cfg: &Config) -> Histogram {
let total_bins = cfg.num_bins.iter().product();
let bin_width: Vec<f64> = (0..cfg.dimens).map(|idx| {
(cfg.hist_max[idx] - cfg.hist_min[idx])/(cfg.num_bins[idx] as f64)
}).collect();
let mut hist = vec![0.0; total_bins];
for i in (0..timeseries[0].len()).filter(|i| mask[*i]) {
let mut values: Vec<f64> = vec![f64::NAN; cfg.dimens+1];
for j in 0..values.len() {
values[j] = timeseries[j][i];
}
if is_in_hist_boundaries(&values[1..], cfg) {
let bin_indeces: Vec<usize> = (0..cfg.dimens).map(|dimen: usize| {
let val = values[dimen+1];
((val - cfg.hist_min[dimen]) / bin_width[dimen]) as usize
}).collect();
let index = flat_index(&bin_indeces, &cfg.num_bins);
hist[index] += 1.0;
}
}
let num_points: f64 = hist.iter().sum();
Histogram::new(num_points as u32, hist)
}
fn flat_index(indeces: &[usize], lengths: &[usize]) -> usize {
indeces.iter().enumerate().map(|(i, idx)| {
idx * lengths.iter().take(i).product::<usize>()
}).sum()
}
fn is_in_hist_boundaries(values: &[f64], cfg: &Config) -> bool {
for dimen in 0..cfg.dimens {
if values[dimen] < cfg.hist_min[dimen] || values[dimen] >= cfg.hist_max[dimen] {
return false
}
}
true
}
fn is_in_time_boundaries(time: f64, start: f64, end: f64) -> bool {
if start <= time && time <= end {
return true
}
false
}
fn read_window_file(window_file: &str, cfg: &Config) -> Result<(Vec<Vec<f64>>, usize)> {
let mut timeseries: Vec<Vec<f64>> = read_timeseries(window_file, cfg)?;
let time_series_mask: Vec<bool> = timeseries[0].iter()
.map(|t| is_in_time_boundaries(*t, cfg.start, cfg.end)).collect();
timeseries = timeseries.into_iter().map(|ts| {
ts.into_iter().zip(time_series_mask.iter()).filter_map(|(val, mask)| {
if *mask {
Some(val)
} else {
None
}
}).collect()
}).collect::<Vec<Vec<f64>>>();
let timeseries_inital_length = timeseries[0].len();
if cfg.uncorr {
timeseries = uncorrelate(timeseries, cfg);
}
if timeseries[0].is_empty() && !cfg.ignore_empty {
bail!("Time series is empty")
}
Ok((timeseries, timeseries_inital_length))
}
fn read_timeseries(window_file: &str, cfg: &Config) -> Result<Vec<Vec<f64>>> {
let f = File::open(window_file)
.chain_err(|| format!("Failed to open sample data file {}.", window_file))?;
let mut buf = BufReader::new(&f);
let mut timeseries = vec![Vec::new(); cfg.dimens+1];
let mut line = String::new();
let mut linecount = 0;
while buf.read_line(&mut line).chain_err(|| "Failed to read line")? > 0 {
linecount += 1;
if line.starts_with('#') || line.starts_with('@') || line.is_empty() {
line.clear();
continue;
}
{
let split: Vec<&str> = line.split_whitespace().collect();
if split.len() < cfg.dimens+1 {
bail!(format!("Wrong number of columns in line {} of window file {}. Empty Line?.", linecount, window_file));
}
for i in 0..cfg.dimens+1 {
timeseries[i].push(split[i].parse::<f64>()
.chain_err(|| format!("Failed to parse line {} of window file {}.", linecount, window_file))?
);
}
}
line.clear();
}
Ok(timeseries)
}
fn uncorrelate(timeseries: Vec<Vec<f64>>, cfg: &Config) -> Vec<Vec<f64>> {
let gs: Vec<f64> = timeseries[1..].iter().map(|ts| statistical_ineff(ts)).collect();
let mut max_g = 1.0;
for g in gs {
if g > max_g {
max_g = g;
}
}
let mut trunc_g = max_g.trunc() as usize;
if (trunc_g as f64 - max_g).abs() > 0.000_000_000_1 {
trunc_g += 1;
}
let prev_len = timeseries[0].len();
let timeseries = timeseries.into_iter().map(|ts| {
ts.into_iter().step_by(trunc_g).collect::<Vec<f64>>()
}).collect::<Vec<Vec<f64>>>();
let new_len = timeseries[0].len();
if cfg.verbose {
let tau = autocorrelation_time(max_g)* (timeseries[0][1]-timeseries[0][0]);
vprintln(format!("{:?}/{:?} samples are uncorrelated. {:?} samples removed from timeseries (tau={:.5})", new_len, prev_len, prev_len-new_len, tau), true);
}
timeseries
}
pub fn write_results(out_file: &str, append: bool, ds: &Dataset, free: &[f64],
free_std: &[f64], prob: &[f64], prob_std: &[f64], index: Option<usize>) -> Result<()> {
if !append && Path::new(out_file).exists() {
std::fs::remove_file(out_file).chain_err(|| "Failed to delete file.")?;
}
let output = OpenOptions::new().write(true)
.append(true)
.create(true)
.open(out_file)
.chain_err(|| format!("Failed to create file with path {}", out_file))?;
let mut buf = BufWriter::new(output);
let header: String = (0..ds.dimens_lengths.len()).map(|d| format!("coord{}", d+1))
.collect::<Vec<String>>().join(" ");
if let Some(index) = index {
writeln!(buf, "#Dataset {}", index).unwrap();
}
writeln!(buf, "#{} Free Energy +/- Probability +/-", header).unwrap();
for bin in 0..free.len() {
let coords = ds.get_coords_for_bin(bin);
let coords_str: String = coords.iter().map(|c| {format!("{:8.6} ", c)})
.collect::<Vec<String>>().join("\t");
writeln!(buf, "{}{:8.6} {:8.6} {:8.6} {:8.6}", coords_str,
free[bin], free_std[bin], prob[bin], prob_std[bin])
.chain_err(|| "Failed to write to file.")?;
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use assert_approx_eq::assert_approx_eq;
fn cfg() -> Config {
Config {
metadata_file: "example/1d_cyclic/metadata.dat".to_string(),
hist_min: vec![-3.14],
hist_max: vec![3.14],
num_bins: vec![10],
dimens: 1,
verbose: false,
tolerance: 0.0,
max_iterations: 0,
temperature: 300.0,
cyclic: false,
output: "qwert".to_string(),
bootstrap: 0,
bootstrap_seed: 1234,
start: 0.0,
end: 1e+20,
uncorr: false,
convdt: 0.0,
ignore_empty: false,
}
}
#[test]
fn read_window_file() {
let f = "example/1d_cyclic/COLVAR+0.0.xvg";
let cfg = cfg();
let (timeseries, timeseries_inital_length) = super::read_window_file(&f, &cfg).unwrap();
let mask = vec![true; timeseries[0].len()];
let h = build_histogram_from_timeseries(×eries, &mask, &cfg);
println!("{:?}", h);
assert_eq!(5000, timeseries_inital_length);
assert_eq!(5000, h.num_points);
assert_approx_eq!(0.0, h.bins[2]);
assert_approx_eq!(11.0, h.bins[3]);
assert_approx_eq!(2236.0, h.bins[4]);
assert_approx_eq!(2714.0, h.bins[5]);
assert_approx_eq!(39.0, h.bins[6]);
assert_approx_eq!(0.0, h.bins[7]);
}
#[test]
fn read_timeseries() {
let f = "example/1d_cyclic/COLVAR+0.0.xvg";
let cfg = cfg();
let ts = super::read_timeseries(&f, &cfg).unwrap();
let expected = [
-0.153_145,
-0.377_860,
0.010_992,
0.123_074,
0.108_291,
0.261_607,
];
assert!(ts.len() == 2);
assert!(ts[0].len() == 5000);
println!("{:?}", ts);
for (actual, expected) in ts[1].iter().zip(expected.iter()) {
assert!((actual-expected).abs() < 0.001, "{:?} != {:?}", actual, expected);
}
}
#[test]
fn read_data() {
let cfg = cfg();
let ds = &super::read_data(&cfg).unwrap()[0];
println!("{:?}", ds);
assert_eq!(25, ds.num_windows);
assert_eq!(cfg.num_bins.len(), ds.dimens_lengths.len());
assert_eq!(cfg.num_bins[0], ds.dimens_lengths[0]);
assert_approx_eq!(cfg.temperature * k_B, ds.kT);
assert_eq!(25, ds.histograms.len())
}
#[test]
fn read_data_empty() {
let mut cfg = cfg();
cfg.metadata_file = "tests/data/metadata_convdt.dat".to_string();
cfg.start = 2.5;
cfg.end = 9.0;
cfg.ignore_empty = false;
let ds = super::read_data(&cfg);
if ds.is_ok() {
panic!()
}
cfg.ignore_empty = true;
let ds = super::read_data(&cfg);
if ds.is_err() {
panic!()
}
}
#[test]
fn read_data_convdt() {
let mut cfg = cfg();
cfg.metadata_file = "tests/data/metadata_convdt.dat".to_string();
cfg.convdt = 2.0;
cfg.start = 0.0;
cfg.end = 9.0;
let dss = super::read_data(&cfg).unwrap();
assert_eq!(5, dss.len());
for ds in &dss {
assert_eq!(6, ds.num_windows);
assert_eq!(6, ds.histograms.len());
}
let hist_points: Vec<u32> = dss.iter().map(|ds| {
ds.histograms.iter().map(|h| h.num_points).sum()
}).collect();
let expected_hist_points = vec![
300, 600, 900, 1200, 1350, ];
for (expected, actual) in expected_hist_points.iter().zip(hist_points.iter()) {
assert_eq!(expected, actual);
}
}
#[test]
fn read_data_convdt_single() {
let mut cfg = cfg();
cfg.metadata_file = "tests/data/metadata_convdt_single.dat".to_string();
cfg.convdt = 2.0;
cfg.start = 0.0;
cfg.end = 9.0;
let dss = super::read_data(&cfg).unwrap();
assert_eq!(5, dss.len());
for ds in &dss {
assert_eq!(1, ds.num_windows);
assert_eq!(1, ds.histograms.len());
}
let hist_points: Vec<u32> = dss.iter().map(|ds| {
ds.histograms.iter().map(|h| h.num_points).sum()
}).collect();
let expected_hist_points = vec![
0, 100, 200, 250, 250, ];
for (expected, actual) in expected_hist_points.iter().zip(hist_points.iter()) {
assert_eq!(expected, actual);
}
}
#[test]
fn get_relative_path() {
let path1 = "path/to/some_file.dat";
let path2 = "another_file.dat";
let path3 = "subfolder/another_file.dat";
let relative2 = super::get_relative_path(&path1, &path2);
assert_eq!("path/to/another_file.dat" ,relative2);
let relative3 = super::get_relative_path(&path1, &path3);
assert_eq!("path/to/subfolder/another_file.dat" ,relative3);
}
#[test]
fn is_in_time_boundaries() {
let start = 10.0;
let end = 20.0;
assert!(super::is_in_time_boundaries(15.0, start, end));
assert!(super::is_in_time_boundaries(10.0, start, end));
assert!(super::is_in_time_boundaries(20.0, start, end));
assert!(!super::is_in_time_boundaries(9.9999999, start, end));
assert!(!super::is_in_time_boundaries(20.000001, start, end));
}
#[test]
fn get_convdt_boundaries() {
let test = super::get_convdt_boundaries(10.0, 20.0, 10.0);
println!("{:?}", test);
assert!(test.len() == 1);
assert_approx_eq!(test[0].0, 10.0);
assert_approx_eq!(test[0].1, 20.0);
let test = super::get_convdt_boundaries(10.0, 20.0, 5.0);
println!("{:?}", test);
assert!(test.len() == 2);
assert_approx_eq!(test[0].0, 10.0);
assert_approx_eq!(test[0].1, 15.0);
assert_approx_eq!(test[1].0, 10.0);
assert_approx_eq!(test[1].1, 20.0);
let test = super::get_convdt_boundaries(5.0, 30.0, 10.0);
println!("{:?}", test);
assert!(test.len() == 3);
assert_approx_eq!(test[0].0, 5.0);
assert_approx_eq!(test[0].1, 15.0);
assert_approx_eq!(test[1].0, 5.0);
assert_approx_eq!(test[1].1, 25.0);
assert_approx_eq!(test[2].0, 5.0);
assert_approx_eq!(test[2].1, 30.0);
}
}