use std::io::Write;
use std::path::PathBuf;
use anyhow::{Context, Result};
use clap::{Args, Subcommand, ValueEnum};
use log::info;
#[derive(Args)]
pub struct ConvertArgs {
#[command(subcommand)]
pub command: ConvertCommand,
}
#[derive(Subcommand)]
pub enum ConvertCommand {
Gbk(ConvertGbkArgs),
Clusters(ConvertClustersArgs),
}
#[derive(Debug, Clone, ValueEnum)]
pub enum GbkFormat {
Bigslice,
Fna,
Faa,
}
#[derive(Debug, Clone, ValueEnum)]
pub enum ClusterFormat {
Gff,
}
#[derive(Args)]
pub struct ConvertGbkArgs {
#[arg(short, long)]
pub input_dir: PathBuf,
#[arg(short, long)]
pub output_dir: Option<PathBuf>,
#[arg(short, long)]
pub format: GbkFormat,
}
#[derive(Args)]
pub struct ConvertClustersArgs {
#[arg(short, long)]
pub input_dir: PathBuf,
#[arg(short, long)]
pub output_dir: Option<PathBuf>,
#[arg(short, long)]
pub format: ClusterFormat,
}
impl ConvertArgs {
pub fn execute(&self) -> Result<()> {
match &self.command {
ConvertCommand::Gbk(args) => convert_gbk(args),
ConvertCommand::Clusters(args) => convert_clusters(args),
}
}
}
fn convert_gbk(args: &ConvertGbkArgs) -> Result<()> {
let out_dir = args.output_dir.as_ref().unwrap_or(&args.input_dir);
std::fs::create_dir_all(out_dir)?;
info!(
"Converting GenBank files from {:?} to {:?} ({:?})",
args.input_dir, out_dir, args.format
);
let entries: Vec<_> = std::fs::read_dir(&args.input_dir)?
.filter_map(|e| e.ok())
.filter(|e| {
let path = e.path();
let ext_ok = path.extension().map(|ext| ext == "gbk").unwrap_or(false);
if !ext_ok {
return false;
}
path.file_name()
.and_then(|n| n.to_str())
.map(|n| n.contains("_cluster_"))
.unwrap_or(false)
})
.collect();
let mut done = 0usize;
for entry in &entries {
let path = entry.path();
let stem = path.file_stem().unwrap_or_default().to_string_lossy();
let gbk_data = std::fs::read(&path)?;
if !gbk_data
.windows(b"GECCO-Data".len())
.any(|w| w == b"GECCO-Data")
{
log::warn!("GenBank file {:?} was not obtained by GECCO", path);
continue;
}
let seqs: Vec<_> = gb_io::reader::SeqReader::new(&gbk_data[..])
.filter_map(|r| r.ok())
.collect();
match args.format {
GbkFormat::Fna => {
let out_path = out_dir.join(format!("{}.fna", stem));
let mut out = std::fs::File::create(&out_path)?;
for seq in &seqs {
let name = seq.name.as_deref().unwrap_or("unknown");
writeln!(out, ">{}", name)?;
let dna = String::from_utf8_lossy(&seq.seq);
for chunk in dna.as_bytes().chunks(80) {
out.write_all(chunk)?;
writeln!(out)?;
}
}
info!("Wrote {:?}", out_path);
}
GbkFormat::Faa => {
let out_path = out_dir.join(format!("{}.faa", stem));
let mut out = std::fs::File::create(&out_path)?;
for seq in &seqs {
for feat in &seq.features {
if feat.kind != "CDS" {
continue;
}
let Some(locus_tag) = feat.qualifier_values("locus_tag").next() else {
continue;
};
let translation = feat
.qualifier_values("translation")
.next()
.with_context(|| {
format!(
"CDS {locus_tag:?} in {path:?} has no translation qualifier"
)
})?;
writeln!(out, ">{}", locus_tag)?;
for chunk in translation.as_bytes().chunks(80) {
out.write_all(chunk)?;
writeln!(out)?;
}
}
}
info!("Wrote {:?}", out_path);
}
GbkFormat::Bigslice => {
let out_path = out_dir.join(format!("{}.region001.gbk", stem));
std::fs::copy(&path, &out_path)?;
info!("Wrote {:?}", out_path);
}
}
done += 1;
}
info!("Converted {} file(s)", done);
Ok(())
}
fn convert_clusters(args: &ConvertClustersArgs) -> Result<()> {
let out_dir = args.output_dir.as_ref().unwrap_or(&args.input_dir);
std::fs::create_dir_all(out_dir)?;
info!(
"Converting cluster tables from {:?} to {:?} ({:?})",
args.input_dir, out_dir, args.format
);
let entries: Vec<_> = std::fs::read_dir(&args.input_dir)?
.filter_map(|e| e.ok())
.filter(|e| e.path().to_string_lossy().ends_with(".clusters.tsv"))
.collect();
for entry in &entries {
let path = entry.path();
let full_name = path.to_string_lossy();
let base = full_name
.strip_suffix(".clusters.tsv")
.unwrap_or(&full_name);
let base_name = std::path::Path::new(base)
.file_name()
.unwrap_or_default()
.to_string_lossy();
match args.format {
ClusterFormat::Gff => {
let out_path = out_dir.join(format!("{}.clusters.gff", base_name));
let rdr =
std::fs::File::open(&path).with_context(|| format!("opening {:?}", path))?;
let mut tsv_rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_reader(rdr);
let mut out = std::fs::File::create(&out_path)?;
writeln!(out, "##gff-version 3")?;
for result in tsv_rdr.deserialize::<crate::io::tables::ClusterRow>() {
let row = result?;
writeln!(
out,
"{}\tGECCO\tgene_cluster\t{}\t{}\t.\t.\t.\tID={};type={}",
row.sequence_id, row.start, row.end, row.cluster_id, row.r#type,
)?;
}
info!("Wrote {:?}", out_path);
}
}
}
info!("Converted {} file(s)", entries.len());
Ok(())
}