rustynetics 0.1.4

A high-performance genomics library specialized in handling BAM and BigWig files
Documentation
use std::collections::HashMap;
use std::process;

use clap::{Arg, ArgAction, Command};

use rustynetics::granges::GRanges;
use rustynetics::meta::MetaData;

mod common;

/// One genomic span with its strand, used as the accumulator when several
/// rows are collapsed into a single output row.
#[derive(Clone, Copy)]
struct GtfRecord {
    /// Lower bound of the span, copied from the source range.
    from: usize,
    /// Upper bound of the span, copied from the source range.
    to: usize,
    /// Strand character; replaced by `'*'` once merged records disagree.
    strand: char,
}

impl GtfRecord {
    /// Grows `self` so that it also covers `other`.
    ///
    /// The resulting span runs from the smaller `from` to the larger `to`.
    /// When the two strands differ the strand is set to `'*'`.
    fn merge(&mut self, other: GtfRecord) {
        let GtfRecord { from, to, strand } = other;

        if from < self.from {
            self.from = from;
        }
        if to > self.to {
            self.to = to;
        }
        if strand != self.strand {
            self.strand = '*';
        }
    }
}

/// Collapses all rows of `granges` that share the same value in the string
/// meta column `merge_by` and the same sequence name into one row.
///
/// Each output row spans from the smallest `from` to the largest `to` of its
/// group; strand handling is delegated to `GtfRecord::merge`. Rows whose
/// attribute value is empty are skipped. Output rows are ordered by attribute
/// value and then by sequence name. The attribute value is stored in a `name`
/// meta column and a `score` column filled with zeros is added.
///
/// # Errors
///
/// Returns an error message when the `merge_by` column is missing or when a
/// meta column cannot be added to the result.
fn merge_rows(granges: &GRanges, merge_by: &str) -> Result<GRanges, String> {
    let Some(values) = granges.meta.get_column_str(merge_by) else {
        return Err(format!("attribute `{merge_by}` missing"));
    };

    // attribute value -> sequence name -> accumulated span
    let mut entries: HashMap<String, HashMap<String, GtfRecord>> = HashMap::new();

    let labelled_rows = values
        .iter()
        .enumerate()
        .filter(|(_, value)| !value.is_empty());

    for (row, value) in labelled_rows {
        let span = GtfRecord {
            from: granges.ranges[row].from,
            to: granges.ranges[row].to,
            strand: granges.strand[row],
        };
        let by_seq = entries.entry(value.clone()).or_default();
        // Look up first so the sequence name is only cloned for new entries.
        match by_seq.get_mut(&granges.seqnames[row]) {
            Some(known) => known.merge(span),
            None => {
                by_seq.insert(granges.seqnames[row].clone(), span);
            }
        }
    }

    // Hash map iteration order is arbitrary, so sort for a stable output.
    let mut groups: Vec<(String, HashMap<String, GtfRecord>)> = entries.into_iter().collect();
    groups.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

    let mut names = Vec::new();
    let mut seqnames = Vec::new();
    let mut from = Vec::new();
    let mut to = Vec::new();
    let mut strand = Vec::new();

    for (group_name, by_seq) in groups {
        let mut spans: Vec<(String, GtfRecord)> = by_seq.into_iter().collect();
        spans.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));

        for (seqname, span) in spans {
            names.push(group_name.clone());
            seqnames.push(seqname);
            from.push(span.from);
            to.push(span.to);
            strand.push(span.strand);
        }
    }

    let mut merged = GRanges::new(seqnames, from, to, strand);

    merged
        .meta
        .add("name", MetaData::StringArray(names))
        .map_err(|error| error.to_string())?;

    let zero_scores = vec![0; merged.num_rows()];
    merged
        .meta
        .add("score", MetaData::IntArray(zero_scores))
        .map_err(|error| error.to_string())?;

    Ok(merged)
}

fn main() {
    let matches = Command::new("gtf-to-bed")
        .about("Convert GTF records to BED6")
        .arg(Arg::new("input").long("input").value_name("FILE"))
        .arg(Arg::new("output").long("output").value_name("FILE"))
        .arg(
            Arg::new("merge-by")
                .long("merge-by")
                .value_name("ATTRIBUTE"),
        )
        .arg(
            Arg::new("verbose")
                .short('v')
                .long("verbose")
                .action(ArgAction::SetTrue),
        )
        .get_matches();

    let input_path = matches.get_one::<String>("input").map(String::as_str);
    let output_path = matches.get_one::<String>("output").map(String::as_str);
    let merge_by = matches
        .get_one::<String>("merge-by")
        .map(String::as_str)
        .unwrap_or("");
    let verbose = matches.get_flag("verbose");

    let opt_names = if merge_by.is_empty() {
        Vec::new()
    } else {
        vec![merge_by]
    };
    let opt_types = if merge_by.is_empty() {
        Vec::new()
    } else {
        vec!["str"]
    };
    let defaults = if merge_by.is_empty() {
        Vec::new()
    } else {
        vec![Some("")]
    };

    let mut reader = common::open_reader(input_path).unwrap_or_else(|error| {
        eprintln!("opening GTF failed: {error}");
        process::exit(1);
    });
    if verbose {
        eprintln!("Reading GTF...");
    }
    let mut granges = GRanges::read_gtf(&mut reader, opt_names, opt_types, defaults)
        .unwrap_or_else(|error| {
            eprintln!("reading GTF failed: {error}");
            process::exit(1);
        });

    granges.meta.rename_meta("score", "gtfScore");

    let mut bed = if merge_by.is_empty() {
        granges.meta.rename_meta("feature", "name");
        granges
    } else {
        merge_rows(&granges, merge_by).unwrap_or_else(|error| {
            eprintln!("merging rows failed: {error}");
            process::exit(1);
        })
    };

    if bed.meta.get_column_int("score").is_none() {
        if let Err(error) = bed
            .meta
            .add("score", MetaData::IntArray(vec![0; bed.num_rows()]))
        {
            eprintln!("adding BED score column failed: {error}");
            process::exit(1);
        }
    }

    let mut writer = common::open_writer(output_path).unwrap_or_else(|error| {
        eprintln!("opening output failed: {error}");
        process::exit(1);
    });
    if let Err(error) = bed.write_bed6(&mut writer) {
        eprintln!("writing BED failed: {error}");
        process::exit(1);
    }
}