use clap::*;
use itertools::Itertools;
use std::collections::BTreeMap;
use std::io::BufRead;
/// Builds the clap definition of the `sort` subcommand.
///
/// Arguments:
/// * `infiles` (positional, required, one or more) - input files.
/// * `-H/--header` - flag; the first line of each file is a header.
/// * `-f/--field <usize>` - index of the range field.
/// * `-g/--group <usize>` - field to group rows by before sorting.
/// * `-o/--outfile <name>` - output file name, defaulting to `stdout`.
pub fn make_subcommand() -> Command {
    Command::new("sort")
        .about("Sort .rg and .tsv files by a range field")
        .after_help(
            r###"
* If no part of the line is a valid range, the line will be written to the end of the output
* Using `--group` can improve performance on large datasets by grouping rows before sorting.
* The group_key can be chr_id, ctg_id, etc.
Example:
# Sort a .rg file
rgr sort tests/rgr/S288c.rg
# Sort a .tsv file by the first valid range
rgr sort tests/rgr/ctg.range.tsv
# Sort a .tsv file by a specific range field and treat the first line as a header
rgr sort tests/rgr/ctg.range.tsv -H -f 3
"###,
        )
        .arg(
            Arg::new("infiles")
                .required(true)
                .num_args(1..)
                .index(1)
                .help("Input files to process. Multiple files can be specified"),
        )
        .arg(
            Arg::new("header")
                .long("header")
                .short('H')
                .action(ArgAction::SetTrue)
                .help("Treat the first line of each file as a header"),
        )
        .arg(
            Arg::new("field")
                .long("field")
                .short('f')
                .num_args(1)
                .value_parser(value_parser!(usize))
                .help("Index of the range field. If not set, the first valid range will be used"),
        )
        .arg(
            Arg::new("group")
                .long("group")
                .short('g')
                .num_args(1)
                .value_parser(value_parser!(usize))
                .help("Group the rows by this field and then sort within each group"),
        )
        .arg(
            Arg::new("outfile")
                .long("outfile")
                .short('o')
                .num_args(1)
                .default_value("stdout")
                .help("Output filename. [stdout] for screen"),
        )
}
/// Runs the `sort` subcommand.
///
/// Reads every input file line by line and writes, in this order:
///
/// 1. the first line of each file, when `--header` is set (written as soon as
///    it is read);
/// 2. all lines from which `intspan::extract_rg` could extract a range, ordered
///    by `(chr, start, strand)`; with `--group N` (N >= 1) the lines are first
///    partitioned by their N-th tab-separated field, the groups are emitted in
///    ascending key order, and the range ordering is applied inside each group;
/// 3. all lines without a valid range, in the order they were read.
///
/// Lines with a valid range are keyed by their full text, so identical lines
/// are emitted only once. Lines without a valid range are kept as-is,
/// duplicates included.
///
/// # Errors
///
/// Returns an error if writing to the output fails, or if `--group N` is given
/// and a line with a valid range has fewer than N tab-separated fields.
pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
    // `outfile` has a default value and `infiles` is required, so clap
    // guarantees both lookups succeed.
    let mut writer = intspan::writer(args.get_one::<String>("outfile").unwrap());
    let is_header = args.get_flag("header");
    // 0 means "not set" for both indexes.
    let opt_idx_range = args.get_one::<usize>("field").copied().unwrap_or(0);
    let opt_idx_group = args.get_one::<usize>("group").copied().unwrap_or(0);

    // Keyed by the line text: ties in the sort key fall back to line order and
    // repeated lines collapse into a single entry.
    let mut line_to_rg: BTreeMap<String, intspan::Range> = BTreeMap::new();
    let mut invalids: Vec<String> = vec![];

    for infile in args.get_many::<String>("infiles").unwrap() {
        let reader = intspan::reader(infile);
        // NOTE: `map_while(Result::ok)` stops reading this file at the first
        // line that fails to read; the error itself is not reported.
        for (i, line) in reader.lines().map_while(Result::ok).enumerate() {
            if is_header && i == 0 {
                writer.write_fmt(format_args!("{}\n", line))?;
                continue;
            }

            // The line is owned here, so move it into its destination instead
            // of cloning it.
            match intspan::extract_rg(&line, opt_idx_range) {
                Some(range) => {
                    line_to_rg.insert(line, range);
                }
                None => invalids.push(line),
            }
        }
    }

    // Partition the valid lines into the groups that are sorted independently.
    // Each entry carries its range so the sort key needs no second map lookup.
    let groups: Vec<Vec<(&String, &intspan::Range)>> = if opt_idx_group == 0 {
        vec![line_to_rg.iter().collect()]
    } else {
        let mut lines_of: BTreeMap<&str, Vec<(&String, &intspan::Range)>> = BTreeMap::new();
        for (line, range) in &line_to_rg {
            // `--group` is 1-based. A line that is too short is reported as an
            // error rather than aborting the process with a panic.
            let group_key = line.split('\t').nth(opt_idx_group - 1).ok_or_else(|| {
                anyhow::anyhow!(
                    "cannot group by field {}: line has too few tab-separated fields: {}",
                    opt_idx_group,
                    line
                )
            })?;
            lines_of.entry(group_key).or_default().push((line, range));
        }
        // A BTreeMap yields its values in ascending key order already.
        lines_of.into_values().collect()
    };

    for group in groups {
        // Stable sort: lines with equal keys keep their line-text order.
        let ordered = group.into_iter().sorted_by_cached_key(|&(_, range)| {
            (range.chr().clone(), range.start(), range.strand().clone())
        });
        for (line, _) in ordered {
            writer.write_fmt(format_args!("{}\n", line))?;
        }
    }

    for line in &invalids {
        writer.write_fmt(format_args!("{}\n", line))?;
    }

    // Flush explicitly so that a failure to write buffered output is reported
    // instead of being dropped silently when the writer goes out of scope.
    writer.flush()?;

    Ok(())
}