use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
use anyhow::Context;
use clap::Args;
use biodream::{CsvOptions, ReadOptions, TimeFormat};
// Time-column format selectable on the command line.
//
// Each variant is converted one-to-one into the matching `biodream::TimeFormat`
// variant by the `From<CliTimeFormat>` impl in this file.
//
// NOTE: plain `//` comments are used here on purpose. clap's derive macros can
// surface `///` doc comments as help text, so switching to doc comments would
// change what `--help` prints.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, clap::ValueEnum)]
pub enum CliTimeFormat {
// `Default` variant; also the `default_value_t` of `--time-format` in `ConvertArgs`.
#[default]
Seconds,
Milliseconds,
Hms,
}
impl From<CliTimeFormat> for TimeFormat {
fn from(f: CliTimeFormat) -> Self {
match f {
CliTimeFormat::Seconds => Self::Seconds,
CliTimeFormat::Milliseconds => Self::Milliseconds,
CliTimeFormat::Hms => Self::Hms,
}
}
}
// Output formats the conversion command can write.
//
// Selected explicitly with `--format`, or inferred from a file extension by
// `resolve_format`. `write_output` dispatches on this enum.
//
// NOTE: plain `//` comments are used here on purpose. clap's derive macros can
// surface `///` doc comments as help text, so switching to doc comments would
// change what `--help` prints.
#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
pub enum OutputFormat {
// Written by `write_csv` via `biodream::to_csv`.
Csv,
// Written by `write_arrow`; compiled in with the `arrow` or `parquet` feature.
Arrow,
// Written by `write_parquet`; compiled in with the `parquet` feature.
Parquet,
// Written by `write_hdf5`; compiled in with the `hdf5` feature.
Hdf5,
// `write_mat` delegates to `write_hdf5`, so this also needs the `hdf5` feature.
Mat,
}
// Command-line arguments consumed by `run`.
//
// NOTE: plain `//` comments are used on the fields on purpose. clap's derive
// macros turn `///` doc comments into help text, so switching to doc comments
// would change what `--help` prints.
#[derive(Debug, Args)]
pub struct ConvertArgs {
// Input file (positional). The literal path `-` makes `read_with_options`
// read the whole input from stdin instead.
#[arg(value_name = "FILE")]
pub path: PathBuf,
// Output file. When omitted, `resolve_output_path` derives it from the input
// file name and the chosen format; it is required when reading from stdin.
#[arg(short, long, value_name = "OUTPUT")]
pub output: Option<PathBuf>,
// Explicit output format. When omitted, `resolve_format` infers it from the
// output path's extension (or the input's, if no output was given).
#[arg(short = 'F', long, value_name = "FORMAT")]
pub format: Option<OutputFormat>,
// Comma-separated channel indices, forwarded to `ReadOptions::channels`.
// Mutually exclusive with the two name-based selectors below.
// NOTE(review): whether indices are 0- or 1-based is not visible here — confirm.
#[arg(long, value_delimiter = ',', value_name = "INDEX", conflicts_with_all = ["channel_name", "channel_contains"])]
pub channels: Option<Vec<usize>>,
// Channel names; each is resolved to an index with `find_channel_by_name`
// on the opened file. Not usable with stdin input.
#[arg(long = "channel-name", value_name = "NAME", conflicts_with_all = ["channels", "channel_contains"])]
pub channel_name: Option<Vec<String>>,
// Single search string, resolved to one channel index with
// `find_channel_containing`. Not usable with stdin input.
#[arg(long = "channel-contains", value_name = "NEEDLE", conflicts_with_all = ["channels", "channel_name"])]
pub channel_contains: Option<String>,
// Forwarded to `ReadOptions::scaled`.
#[arg(long)]
pub scaled: bool,
// Forwarded to `CsvOptions::time_format` after conversion to `TimeFormat`.
#[arg(long, value_enum, default_value_t = CliTimeFormat::Seconds, value_name = "FMT")]
pub time_format: CliTimeFormat,
// Forwarded to `CsvOptions::precision`; defaults to 6.
#[arg(long, default_value_t = 6, value_name = "N")]
pub precision: usize,
// CSV delimiter as text; `parse_delimiter` accepts a single ASCII character,
// the word `tab`, or the two-character sequence `\t`.
#[arg(long, default_value = ",", value_name = "CHAR")]
pub delimiter: String,
// Forwarded to `CsvOptions::include_raw`.
#[arg(long)]
pub include_raw: bool,
// Forwarded to `CsvOptions::fill_value`; defaults to the empty string.
#[arg(long, default_value = "", value_name = "STR")]
pub fill_value: String,
}
pub fn run(args: ConvertArgs) -> anyhow::Result<()> {
let format = resolve_format(args.format, &args.path, args.output.as_deref())?;
let csv_opts = build_csv_options(&args)?;
let output_path = resolve_output_path(args.output, &args.path, format)?;
let channel_indices = resolve_channel_indices(
args.channels.as_deref(),
args.channel_name.as_deref(),
args.channel_contains.as_deref(),
&args.path,
)?;
let result = read_with_options(&args.path, channel_indices.as_deref(), args.scaled)?;
let df = result.value;
write_output(format, &df, &output_path, &csv_opts)
}
/// Turns the channel-selection flags into a list of channel indices.
///
/// * `by_index` is returned unchanged, without opening the file.
/// * With no selector at all the result is `Ok(None)`.
/// * `by_name` / `by_contains` open `path` with `biodream::open_file` and look
///   the channels up there; `by_name` is checked first.
///
/// # Errors
///
/// Fails when a name-based selector is used with stdin input (`-`), when the
/// file cannot be opened, or when a requested channel is not found (the error
/// then lists the channel names that are available).
fn resolve_channel_indices(
    by_index: Option<&[usize]>,
    by_name: Option<&[String]>,
    by_contains: Option<&str>,
    path: &Path,
) -> anyhow::Result<Option<Vec<usize>>> {
    match (by_index, by_name, by_contains) {
        (Some(explicit), _, _) => return Ok(Some(explicit.to_vec())),
        (None, None, None) => return Ok(None),
        _ => {}
    }

    // Name lookups need the file's channel metadata, which stdin cannot provide here.
    if path == Path::new("-") {
        return Err(anyhow::anyhow!(
            "--channel-name and --channel-contains require a file path; \
             cannot resolve channel names from stdin"
        ));
    }

    let lazy = biodream::open_file(path)
        .with_context(|| format!("failed to open {}", path.display()))?;

    // Comma-separated channel names; only evaluated when building an error.
    let list_available = || {
        lazy.channel_metadata
            .iter()
            .map(|meta| meta.name.as_str())
            .collect::<Vec<_>>()
            .join(", ")
    };

    if let Some(names) = by_name {
        // Stops at the first name that cannot be resolved.
        let resolved = names
            .iter()
            .map(|name| {
                lazy.find_channel_by_name(name).ok_or_else(|| {
                    let available = list_available();
                    anyhow::anyhow!("no channel named {name:?} (available: {available})")
                })
            })
            .collect::<anyhow::Result<Vec<_>>>()?;
        return Ok(Some(resolved));
    }

    let Some(needle) = by_contains else {
        return Ok(None);
    };
    let idx = lazy.find_channel_containing(needle).ok_or_else(|| {
        let available = list_available();
        anyhow::anyhow!("no channel containing {needle:?} (available: {available})")
    })?;
    Ok(Some(vec![idx]))
}
/// Parses the `--delimiter` argument into a single delimiter byte.
///
/// Accepts the word `tab`, the two-character escape `\t`, a literal tab, or
/// any other single ASCII character.
///
/// # Errors
///
/// Fails for an empty string, more than one character, or a non-ASCII character.
fn parse_delimiter(s: &str) -> anyhow::Result<u8> {
    if matches!(s, "tab" | "\\t" | "\t") {
        return Ok(b'\t');
    }
    // A valid UTF-8 string that is exactly one byte long is exactly one ASCII
    // character, so matching on the byte slice is equivalent to counting chars.
    match s.as_bytes() {
        [byte] if byte.is_ascii() => Ok(*byte),
        _ => Err(anyhow::anyhow!(
            "--delimiter must be a single ASCII character or 'tab'; got {s:?}"
        )),
    }
}
fn build_csv_options(args: &ConvertArgs) -> anyhow::Result<CsvOptions> {
let delimiter = parse_delimiter(&args.delimiter)?;
Ok(CsvOptions::new()
.delimiter(delimiter)
.precision(args.precision)
.time_format(TimeFormat::from(args.time_format))
.include_raw(args.include_raw)
.fill_value(args.fill_value.clone()))
}
/// Reads the input into a parsed `biodream::Datafile`.
///
/// `scaled` and the optional `channel_indices` are applied to the
/// [`ReadOptions`] first. The path `-` means "read everything from stdin and
/// parse the bytes"; any other path is read from disk.
///
/// # Errors
///
/// Fails when stdin or the file cannot be read, or when parsing fails.
fn read_with_options(
    path: &Path,
    channel_indices: Option<&[usize]>,
    scaled: bool,
) -> anyhow::Result<biodream::ParseResult<biodream::Datafile>> {
    let mut opts = ReadOptions::new().scaled(scaled);
    if let Some(selection) = channel_indices {
        opts = opts.channels(selection);
    }

    if path != Path::new("-") {
        return opts
            .read_file(path)
            .with_context(|| format!("failed to read {}", path.display()));
    }

    // Stdin: buffer the whole stream, then parse from memory.
    use std::io::Read;
    let mut bytes = Vec::new();
    std::io::stdin()
        .read_to_end(&mut bytes)
        .context("failed to read from stdin")?;
    opts.read_bytes(&bytes)
        .context("failed to parse .acq from stdin")
}
/// Decides which [`OutputFormat`] to write.
///
/// Resolution order:
/// 1. `explicit` (from `--format`) always wins.
/// 2. Otherwise the extension of `output` is used, or of `input` when no
///    output path was given. Matching is ASCII case-insensitive.
/// 3. With no output path and an unrecognized input extension, the result
///    defaults to CSV.
///
/// # Errors
///
/// Fails when an output path was given but its extension is missing or not
/// one of the recognized ones; the message tells the user to pass `--format`.
pub fn resolve_format(
    explicit: Option<OutputFormat>,
    input: &Path,
    output: Option<&Path>,
) -> anyhow::Result<OutputFormat> {
    if let Some(f) = explicit {
        return Ok(f);
    }
    let ext_path = output.unwrap_or(input);
    // A missing or non-UTF-8 extension is treated as the empty string.
    let ext = ext_path
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("")
        .to_ascii_lowercase();
    match ext.as_str() {
        "csv" | "txt" | "tsv" => Ok(OutputFormat::Csv),
        "arrow" | "ipc" => Ok(OutputFormat::Arrow),
        "parquet" | "pq" => Ok(OutputFormat::Parquet),
        "h5" | "hdf5" => Ok(OutputFormat::Hdf5),
        "mat" => Ok(OutputFormat::Mat),
        // No output path at all: fall back to CSV rather than failing.
        _ if output.is_none() => Ok(OutputFormat::Csv),
        // Output path without a usable extension: say so, instead of
        // reporting the meaningless extension '.'.
        "" => anyhow::bail!(
            "cannot infer output format: '{}' has no recognizable file extension; \
             use --format csv|arrow|parquet|hdf5|mat",
            ext_path.display()
        ),
        other => anyhow::bail!(
            "cannot infer output format from extension '.{other}'; \
             use --format csv|arrow|parquet|hdf5|mat"
        ),
    }
}
/// Chooses the file to write to.
///
/// An `explicit` path is returned unchanged. Otherwise the name is derived
/// from the input: the part of the file name before its first extension dot
/// (`Path::file_prefix`), plus the default extension for `format`. If the
/// input has no usable file name the base name `output` is used.
///
/// NOTE(review): only the file name of `input` is used, so the derived path is
/// relative (no parent directory is carried over) — confirm this is intended.
///
/// # Errors
///
/// Fails when no explicit path is given and the input is stdin (`-`).
fn resolve_output_path(
    explicit: Option<PathBuf>,
    input: &Path,
    format: OutputFormat,
) -> anyhow::Result<PathBuf> {
    match explicit {
        Some(chosen) => Ok(chosen),
        None if input == Path::new("-") => Err(anyhow::anyhow!(
            "--output is required when reading from stdin"
        )),
        None => {
            let extension = match format {
                OutputFormat::Csv => "csv",
                OutputFormat::Arrow => "arrow",
                OutputFormat::Parquet => "parquet",
                OutputFormat::Hdf5 => "h5",
                OutputFormat::Mat => "mat",
            };
            let base = input
                .file_prefix()
                .map(PathBuf::from)
                .unwrap_or_else(|| PathBuf::from("output"));
            Ok(base.with_extension(extension))
        }
    }
}
fn write_output(
format: OutputFormat,
df: &biodream::Datafile,
output_path: &Path,
csv_opts: &CsvOptions,
) -> anyhow::Result<()> {
match format {
OutputFormat::Csv => write_csv(df, output_path, csv_opts),
OutputFormat::Arrow => write_arrow(df, output_path),
OutputFormat::Parquet => write_parquet(df, output_path),
OutputFormat::Hdf5 => write_hdf5(df, output_path),
OutputFormat::Mat => write_mat(df, output_path),
}
}
fn write_csv(
df: &biodream::Datafile,
output_path: &Path,
csv_opts: &CsvOptions,
) -> anyhow::Result<()> {
let file = File::create(output_path)
.with_context(|| format!("failed to create {}", output_path.display()))?;
let mut w = BufWriter::new(file);
biodream::to_csv(df, &mut w, csv_opts).context("CSV export failed")?;
w.flush().context("flush failed")
}
/// Writes `df` as Arrow IPC to `output_path`.
///
/// Compiled in when either the `arrow` or the `parquet` feature is enabled.
///
/// # Errors
///
/// Fails when the file cannot be created, the export fails, or the final
/// flush fails.
#[cfg(any(feature = "arrow", feature = "parquet"))]
fn write_arrow(df: &biodream::Datafile, output_path: &Path) -> anyhow::Result<()> {
    let mut sink = File::create(output_path)
        .map(BufWriter::new)
        .with_context(|| format!("failed to create {}", output_path.display()))?;
    biodream::to_arrow_ipc(df, &mut sink).context("Arrow IPC export failed")?;
    // Flush explicitly: dropping a `BufWriter` would swallow a late write error.
    sink.flush().context("flush failed")
}
/// Stand-in used when neither the `arrow` nor the `parquet` feature is
/// enabled: always fails with a hint on how to rebuild.
#[cfg(not(any(feature = "arrow", feature = "parquet")))]
fn write_arrow(_df: &biodream::Datafile, _output_path: &Path) -> anyhow::Result<()> {
    Err(anyhow::anyhow!(
        "Arrow IPC export requires the 'arrow' feature; recompile with: cargo build --features arrow"
    ))
}
/// Writes `df` as Parquet to `output_path` with default `ParquetOptions`.
///
/// Compiled in when the `parquet` feature is enabled. The buffered writer is
/// handed to `biodream::to_parquet` by value.
///
/// NOTE(review): there is no explicit flush here, unlike the CSV and Arrow
/// writers; this relies on `to_parquet` (or the writer's drop) to flush —
/// confirm that late write errors cannot be lost.
///
/// # Errors
///
/// Fails when the file cannot be created or the export fails.
#[cfg(feature = "parquet")]
fn write_parquet(df: &biodream::Datafile, output_path: &Path) -> anyhow::Result<()> {
    let sink = File::create(output_path)
        .map(BufWriter::new)
        .with_context(|| format!("failed to create {}", output_path.display()))?;
    let options = biodream::ParquetOptions::default();
    biodream::to_parquet(df, sink, &options).context("Parquet export failed")
}
/// Stand-in used when the `parquet` feature is disabled: always fails with a
/// hint on how to rebuild.
#[cfg(not(feature = "parquet"))]
fn write_parquet(_df: &biodream::Datafile, _output_path: &Path) -> anyhow::Result<()> {
    Err(anyhow::anyhow!(
        "Parquet export requires the 'parquet' feature; recompile with: cargo build --features parquet"
    ))
}
/// Writes `df` as HDF5 to `output_path` with default `Hdf5Options`.
///
/// Compiled in when the `hdf5` feature is enabled. The path is passed straight
/// to `biodream::to_hdf5`; this function does not open the file itself.
///
/// # Errors
///
/// Fails when the export fails; the error names the output path.
#[cfg(feature = "hdf5")]
fn write_hdf5(df: &biodream::Datafile, output_path: &Path) -> anyhow::Result<()> {
    let options = biodream::Hdf5Options::default();
    let outcome = biodream::to_hdf5(df, output_path, &options);
    outcome.with_context(|| format!("HDF5 export failed: {}", output_path.display()))
}
/// Stand-in used when the `hdf5` feature is disabled: always fails with a
/// hint on how to rebuild.
#[cfg(not(feature = "hdf5"))]
fn write_hdf5(_df: &biodream::Datafile, _output_path: &Path) -> anyhow::Result<()> {
    Err(anyhow::anyhow!(
        "HDF5 export requires the 'hdf5' feature; recompile with: cargo build --features hdf5"
    ))
}
/// Writes `df` as a MAT file by delegating to the HDF5 writer.
///
/// MAT output therefore has the same requirements as HDF5 output, including
/// the `hdf5` feature.
///
/// # Errors
///
/// Returns the HDF5 writer's error, wrapped in context that names MAT export,
/// so a user who asked for `--format mat` can see why an HDF5 error appears.
fn write_mat(df: &biodream::Datafile, output_path: &Path) -> anyhow::Result<()> {
    write_hdf5(df, output_path)
        .context("MAT export failed (MAT files are written through the HDF5 writer)")
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Infers the format for input `a.acq` written to `output`, with no `--format`.
    fn format_for_output(output: &str) -> anyhow::Result<OutputFormat> {
        resolve_format(None, Path::new("a.acq"), Some(Path::new(output)))
    }

    /// Derives the default output path for `input` when no `--output` is given.
    fn default_output(input: &str, format: OutputFormat) -> anyhow::Result<PathBuf> {
        resolve_output_path(None, Path::new(input), format)
    }

    // --- resolve_format ---

    #[test]
    fn resolve_format_explicit_csv() -> anyhow::Result<()> {
        let chosen = resolve_format(Some(OutputFormat::Csv), Path::new("a.acq"), None)?;
        assert_eq!(chosen, OutputFormat::Csv);
        Ok(())
    }

    #[test]
    fn resolve_format_from_output_extension() -> anyhow::Result<()> {
        assert_eq!(format_for_output("out.parquet")?, OutputFormat::Parquet);
        Ok(())
    }

    #[test]
    fn resolve_format_hdf5_extensions() -> anyhow::Result<()> {
        assert_eq!(format_for_output("out.h5")?, OutputFormat::Hdf5);
        assert_eq!(format_for_output("out.hdf5")?, OutputFormat::Hdf5);
        Ok(())
    }

    #[test]
    fn resolve_format_mat_extension() -> anyhow::Result<()> {
        assert_eq!(format_for_output("out.mat")?, OutputFormat::Mat);
        Ok(())
    }

    #[test]
    fn resolve_format_arrow_from_ipc_extension() -> anyhow::Result<()> {
        assert_eq!(format_for_output("out.ipc")?, OutputFormat::Arrow);
        Ok(())
    }

    #[test]
    fn resolve_format_defaults_to_csv_when_no_output() -> anyhow::Result<()> {
        let chosen = resolve_format(None, Path::new("data.acq"), None)?;
        assert_eq!(chosen, OutputFormat::Csv);
        Ok(())
    }

    #[test]
    fn resolve_format_unknown_extension_returns_error() {
        assert!(format_for_output("out.xyz").is_err());
    }

    // --- resolve_output_path ---

    #[test]
    fn resolve_output_path_derives_csv_from_input() -> anyhow::Result<()> {
        let derived = default_output("data.acq", OutputFormat::Csv)?;
        assert_eq!(derived, PathBuf::from("data.csv"));
        Ok(())
    }

    #[test]
    fn resolve_output_path_derives_arrow_from_input() -> anyhow::Result<()> {
        // Everything after the first dot of the file name is dropped.
        let derived = default_output("my.data.acq", OutputFormat::Arrow)?;
        assert_eq!(derived, PathBuf::from("my.arrow"));
        Ok(())
    }

    #[test]
    fn resolve_output_path_stdin_without_explicit_errors() {
        assert!(default_output("-", OutputFormat::Csv).is_err());
    }

    #[test]
    fn resolve_output_path_derives_hdf5_from_input() -> anyhow::Result<()> {
        let derived = default_output("data.acq", OutputFormat::Hdf5)?;
        assert_eq!(derived, PathBuf::from("data.h5"));
        Ok(())
    }

    #[test]
    fn resolve_output_path_derives_mat_from_input() -> anyhow::Result<()> {
        let derived = default_output("data.acq", OutputFormat::Mat)?;
        assert_eq!(derived, PathBuf::from("data.mat"));
        Ok(())
    }
}