use std::fs::File;
use std::io::{BufWriter, Write, stdout};
use std::path::PathBuf;
use anyhow::Result;
use fgoxide::io::DelimFileWriter;
use serde::Serialize;
use crate::commands::command::Command;
use crate::report::KrakenReport;
/// Command-line arguments for converting a Kraken report into a tab-separated table.
#[derive(clap::Args)]
pub struct ReportToTsv {
    /// Path to the Kraken report to read.
    #[arg(short = 'r', long)]
    kraken_report: PathBuf,
    /// Path of the TSV file to create; when omitted, the table is written to standard output.
    #[arg(short, long)]
    output: Option<PathBuf>,
}
/// One output record of the TSV table, corresponding to one row of the Kraken report.
///
/// The fields are serialized in declaration order, so their order here is the column order
/// of the output.
#[derive(Default, Serialize)]
struct TsvRow {
    /// Taxon ID of this row.
    tax_id: u64,
    /// Name of this row's taxon.
    name: String,
    /// Taxonomic rank of this row, rendered as text.
    rank: String,
    /// Depth of this row in the report hierarchy.
    level: usize,
    /// Taxon ID of the parent row, or an empty string when the row has no parent.
    parent_tax_id: String,
    /// Taxonomic rank of the parent row, or an empty string when the row has no parent.
    parent_rank: String,
    /// Clade count reported for this row.
    clade_count: u64,
    /// Direct count reported for this row.
    direct_count: u64,
    /// Clade count minus direct count.
    descendant_count: u64,
    /// `clade_count` divided by the report's total sequences (0.0 when that total is zero).
    frac_clade: f64,
    /// `direct_count` divided by the report's total sequences (0.0 when that total is zero).
    frac_direct: f64,
    /// `descendant_count` divided by the report's total sequences (0.0 when that total is zero).
    frac_descendant: f64,
    /// Minimizer count as text; left empty when the report carries no minimizer data.
    minimizer_count: String,
    /// Distinct minimizer count as text; left empty when the report carries no minimizer data.
    distinct_minimizer_count: String,
}
/// Returns the tab-separated column header line for [`TsvRow`], without a trailing newline.
///
/// The header is produced by serializing a default row through a header-enabled,
/// tab-delimited csv writer backed by an in-memory buffer, then keeping only the first line
/// of what was written.
///
/// # Panics
///
/// Panics if serializing into the in-memory buffer fails, if the written bytes are not valid
/// UTF-8, or if nothing was written.
fn tsv_header() -> String {
    let mut builder = csv::WriterBuilder::new();
    builder.delimiter(b'\t').has_headers(true);

    let mut in_memory = builder.from_writer(Vec::new());
    in_memory.serialize(TsvRow::default()).unwrap();
    in_memory.flush().unwrap();

    let rendered = String::from_utf8(in_memory.into_inner().unwrap()).unwrap();
    // The first line is the header; the line after it is the serialized default row.
    let header_line = rendered.lines().next().unwrap();
    header_line.to_string()
}
impl Command for ReportToTsv {
    /// Loads the Kraken report, converts it to TSV rows, and writes the table.
    ///
    /// The table goes to `self.output` when it is set and to standard output otherwise.
    /// A report that yields no rows still produces the header line.
    ///
    /// # Errors
    ///
    /// Returns an error if the report cannot be loaded, if the output file cannot be
    /// created, or if writing or flushing the table fails.
    fn execute(&self) -> Result<()> {
        let report = KrakenReport::from_path(&self.kraken_report)?;
        let rows = build_tsv_rows(&report);

        // Pick the destination first, then wrap it in a buffer exactly once.
        let sink: Box<dyn Write + Send> = if let Some(path) = &self.output {
            let created = File::create(path).map_err(|e| {
                anyhow::anyhow!("failed to create output {}: {e}", path.display())
            })?;
            Box::new(created)
        } else {
            Box::new(stdout())
        };
        let mut buffered = BufWriter::new(sink);

        if rows.is_empty() {
            // With no records to serialize, emit the header line by hand
            // (presumably the delimited writer only emits a header alongside a record).
            writeln!(buffered, "{}", tsv_header())?;
            buffered.flush()?;
        } else {
            let mut tsv_writer = DelimFileWriter::new(buffered, b'\t', true);
            tsv_writer.write_all(rows)?;
        }

        log::info!("Wrote {} rows to TSV.", report.len());
        Ok(())
    }
}
/// Builds one [`TsvRow`] per row of `report`, in report order.
///
/// Besides the values copied from each report row, every output row carries:
///
/// * the taxon ID and rank of the row returned by `report.parent(i)`, or empty strings when
///   there is no parent;
/// * `descendant_count`, the clade count minus the direct count, clamped at zero;
/// * the clade, direct and descendant counts as fractions of `report.total_sequences()`,
///   all `0.0` when that total is zero;
/// * the minimizer counts as text: empty when the report has no minimizer data, otherwise
///   the row's value, with a missing value rendered as `0`.
fn build_tsv_rows(report: &KrakenReport) -> Vec<TsvRow> {
    let total_sequences = report.total_sequences();
    let has_minimizer_data = report.has_minimizer_data();

    // Counts are only being turned into reporting fractions, so the precision lost when
    // converting large integers to f64 is acceptable.
    #[allow(clippy::cast_precision_loss)]
    let fraction_of_total = |count: u64| -> f64 {
        if total_sequences > 0 {
            count as f64 / total_sequences as f64
        } else {
            0.0
        }
    };

    report
        .rows()
        .iter()
        .enumerate()
        .map(|(index, row)| {
            let clade_count = row.clade_count();
            let direct_count = row.direct_count();
            // A malformed report could list a direct count above its clade count. Plain
            // subtraction would then panic in debug builds and wrap to a huge value in
            // release builds, so clamp at zero instead.
            let descendant_count = clade_count.saturating_sub(direct_count);

            let (minimizer_count, distinct_minimizer_count) = if has_minimizer_data {
                (
                    row.minimizer_count().unwrap_or(0).to_string(),
                    row.distinct_minimizer_count().unwrap_or(0).to_string(),
                )
            } else {
                (String::new(), String::new())
            };

            // Rows without a parent get empty strings for both parent columns.
            let (parent_tax_id, parent_rank) = report
                .parent(index)
                .map(|parent| (parent.taxon_id().to_string(), parent.taxonomic_rank().to_string()))
                .unwrap_or_default();

            TsvRow {
                tax_id: row.taxon_id(),
                name: row.name().to_string(),
                rank: row.taxonomic_rank().to_string(),
                level: row.depth(),
                parent_tax_id,
                parent_rank,
                clade_count,
                direct_count,
                descendant_count,
                frac_clade: fraction_of_total(clade_count),
                frac_direct: fraction_of_total(direct_count),
                frac_descendant: fraction_of_total(descendant_count),
                minimizer_count,
                distinct_minimizer_count,
            }
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing computed fractions.
    const TOLERANCE: f64 = 1e-9;

    /// Asserts that `actual` is within [`TOLERANCE`] of `expected`.
    #[track_caller]
    fn assert_close(actual: f64, expected: f64) {
        assert!((actual - expected).abs() < TOLERANCE);
    }

    /// Formats one six-column report line; the name is indented by two spaces per `depth`.
    fn standard_line(
        pct: f64,
        clade: u64,
        direct: u64,
        rank: &str,
        taxid: u64,
        name: &str,
        depth: usize,
    ) -> String {
        let padding = " ".repeat(depth * 2);
        [
            format!("{pct:.2}"),
            clade.to_string(),
            direct.to_string(),
            rank.to_string(),
            taxid.to_string(),
            format!("{padding}{name}"),
        ]
        .join("\t")
    }

    /// Formats one eight-column report line, which adds the two minimizer columns after the
    /// direct count; the name is indented by two spaces per `depth`.
    #[allow(clippy::too_many_arguments)]
    fn extended_line(
        pct: f64,
        clade: u64,
        direct: u64,
        minimizers: u64,
        distinct: u64,
        rank: &str,
        taxid: u64,
        name: &str,
        depth: usize,
    ) -> String {
        let padding = " ".repeat(depth * 2);
        [
            format!("{pct:.2}"),
            clade.to_string(),
            direct.to_string(),
            minimizers.to_string(),
            distinct.to_string(),
            rank.to_string(),
            taxid.to_string(),
            format!("{padding}{name}"),
        ]
        .join("\t")
    }

    /// Parses report text into a [`KrakenReport`], panicking if parsing fails.
    fn parse(report: &str) -> KrakenReport {
        let bytes = report.as_bytes();
        KrakenReport::from_reader(bytes).unwrap()
    }

    /// Six-row fixture without minimizer columns: unclassified, root, and two domains with
    /// one species each.
    fn make_standard_report() -> KrakenReport {
        let lines = [
            standard_line(10.0, 100, 100, "U", 0, "unclassified", 0),
            standard_line(90.0, 900, 5, "R", 1, "root", 0),
            standard_line(60.0, 600, 10, "D", 2, "Bacteria", 1),
            standard_line(50.0, 500, 500, "S", 3, "Escherichia coli", 2),
            standard_line(30.0, 300, 10, "D", 4, "Eukaryota", 1),
            standard_line(20.0, 200, 200, "S", 5, "Homo sapiens", 2),
        ];
        parse(&lines.join("\n"))
    }

    /// Same taxa as [`make_standard_report`], with the two minimizer columns included.
    fn make_extended_report() -> KrakenReport {
        let lines = [
            extended_line(10.0, 100, 100, 0, 0, "U", 0, "unclassified", 0),
            extended_line(90.0, 900, 5, 500, 400, "R", 1, "root", 0),
            extended_line(60.0, 600, 10, 300, 250, "D", 2, "Bacteria", 1),
            extended_line(50.0, 500, 500, 200, 150, "S", 3, "Escherichia coli", 2),
            extended_line(30.0, 300, 10, 200, 150, "D", 4, "Eukaryota", 1),
            extended_line(20.0, 200, 200, 100, 80, "S", 5, "Homo sapiens", 2),
        ];
        parse(&lines.join("\n"))
    }

    /// Writes `rows` as TSV to a file in a temporary directory and returns the file's text.
    fn write_rows_to_string(rows: Vec<TsvRow>) -> String {
        let scratch = tempfile::TempDir::new().unwrap();
        let target = scratch.path().join("out.tsv");
        let delim_file = fgoxide::io::DelimFile::default();
        delim_file.write_tsv(&target, rows).unwrap();
        std::fs::read_to_string(target).unwrap()
    }

    #[test]
    fn test_level_values() {
        let rows = build_tsv_rows(&make_standard_report());
        let expected_levels = [0usize, 0, 1, 2, 1, 2];
        for (idx, &expected) in expected_levels.iter().enumerate() {
            assert_eq!(rows[idx].level, expected);
        }
    }

    #[test]
    fn test_parent_fields_empty_for_root_and_unclassified() {
        let rows = build_tsv_rows(&make_standard_report());
        for idx in 0..2 {
            assert_eq!(rows[idx].parent_tax_id, "");
            assert_eq!(rows[idx].parent_rank, "");
        }
    }

    #[test]
    fn test_parent_fields_populated_for_children() {
        let rows = build_tsv_rows(&make_standard_report());
        // (row index, expected parent tax id, expected parent rank)
        let expectations = [(2usize, "1", "R"), (3, "2", "D"), (5, "4", "D")];
        for (idx, parent_tax_id, parent_rank) in expectations {
            assert_eq!(rows[idx].parent_tax_id, parent_tax_id);
            assert_eq!(rows[idx].parent_rank, parent_rank);
        }
    }

    #[test]
    fn test_descendant_count() {
        let rows = build_tsv_rows(&make_standard_report());
        // (row index, expected clade count minus direct count)
        let expectations = [(0usize, 0u64), (1, 895), (2, 590), (3, 0)];
        for (idx, expected) in expectations {
            assert_eq!(rows[idx].descendant_count, expected);
        }
    }

    #[test]
    fn test_minimizer_columns_empty_for_standard_report() {
        let rows = build_tsv_rows(&make_standard_report());
        for row in rows.iter() {
            assert_eq!(row.minimizer_count, "");
            assert_eq!(row.distinct_minimizer_count, "");
        }
    }

    #[test]
    fn test_minimizer_columns_populated_for_extended_report() {
        let rows = build_tsv_rows(&make_extended_report());

        let root = &rows[1];
        assert_eq!(root.minimizer_count, "500");
        assert_eq!(root.distinct_minimizer_count, "400");

        let bacteria = &rows[2];
        assert_eq!(bacteria.minimizer_count, "300");
        assert_eq!(bacteria.distinct_minimizer_count, "250");

        for row in rows.iter() {
            assert!(!row.minimizer_count.is_empty());
            assert!(!row.distinct_minimizer_count.is_empty());
        }
    }

    #[test]
    fn test_fraction_calculations() {
        let rows = build_tsv_rows(&make_standard_report());

        let ecoli = &rows[3];
        assert_close(ecoli.frac_clade, 0.5);
        assert_close(ecoli.frac_direct, 0.5);
        assert_close(ecoli.frac_descendant, 0.0);

        let bacteria = &rows[2];
        assert_close(bacteria.frac_clade, 0.6);
        assert_close(bacteria.frac_direct, 0.01);
        assert_close(bacteria.frac_descendant, 0.59);
    }

    #[test]
    fn test_fractions_zero_when_no_sequences() {
        let lines = [
            standard_line(0.0, 0, 0, "U", 0, "unclassified", 0),
            standard_line(0.0, 0, 0, "R", 1, "root", 0),
        ];
        let rows = build_tsv_rows(&parse(&lines.join("\n")));
        for row in rows.iter() {
            assert_close(row.frac_clade, 0.0);
            assert_close(row.frac_direct, 0.0);
            assert_close(row.frac_descendant, 0.0);
        }
    }

    #[test]
    fn test_basic_field_values() {
        let rows = build_tsv_rows(&make_standard_report());

        let unclassified = &rows[0];
        assert_eq!(unclassified.tax_id, 0);
        assert_eq!(unclassified.name, "unclassified");
        assert_eq!(unclassified.rank, "U");
        assert_eq!(unclassified.clade_count, 100);
        assert_eq!(unclassified.direct_count, 100);

        let ecoli = &rows[3];
        assert_eq!(ecoli.tax_id, 3);
        assert_eq!(ecoli.name, "Escherichia coli");
        assert_eq!(ecoli.rank, "S");
    }

    #[test]
    fn test_empty_report_produces_no_rows() {
        let rows = build_tsv_rows(&parse(""));
        assert!(rows.is_empty());
    }

    #[test]
    fn test_write_tsv_header_and_rows() {
        let report_lines = [
            standard_line(50.0, 5, 5, "U", 0, "unclassified", 0),
            standard_line(50.0, 5, 5, "R", 1, "root", 0),
        ];
        let rows = build_tsv_rows(&parse(&report_lines.join("\n")));

        let text = write_rows_to_string(rows);
        let lines: Vec<&str> = text.lines().collect();
        // One header line plus one line per report row.
        assert_eq!(lines.len(), 3);

        let header_cols: Vec<&str> = lines[0].split('\t').collect();
        // (column position, expected header name)
        let expected_columns = [
            (0usize, "tax_id"),
            (3, "level"),
            (6, "clade_count"),
            (9, "frac_clade"),
            (12, "minimizer_count"),
            (13, "distinct_minimizer_count"),
        ];
        for (position, column) in expected_columns {
            assert_eq!(header_cols[position], column);
        }
        assert_eq!(header_cols.len(), 14);

        // Every data line must have exactly as many columns as the header.
        for line in lines.iter().skip(1) {
            assert_eq!(line.split('\t').count(), header_cols.len());
        }
    }
}