use anyhow::{Context, Result};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use crate::cli::MakeAnnotArgs;
use crate::parse::{BimRecord, parse_bim};
pub fn run(args: MakeAnnotArgs) -> Result<()> {
let snps =
parse_bim(&args.bimfile).with_context(|| format!("reading BIM '{}'", args.bimfile))?;
println!("Read {} SNPs from '{}'", snps.len(), args.bimfile);
let annotation: Vec<u8> = if let Some(ref bed_path) = args.bed_file {
annotate_from_bed(&snps, bed_path, args.windowsize, args.nomerge)?
} else if args.gene_set_file.is_some() && args.gene_coord_file.is_some() {
annotate_from_gene_set(
&snps,
args.gene_set_file.as_deref().unwrap(),
args.gene_coord_file.as_deref().unwrap(),
args.windowsize,
)?
} else {
anyhow::bail!(
"Must provide either --bed-file or both --gene-set-file and --gene-coord-file"
);
};
let col_name = derive_annot_name(&args);
write_annot_file(&args.annot_file, &snps, &annotation, &col_name)
.with_context(|| format!("writing annotation file '{}'", args.annot_file))?;
let n_annotated = annotation.iter().filter(|&&v| v == 1).count();
println!(
"Wrote annotation '{}' to '{}': {}/{} SNPs annotated",
col_name,
args.annot_file,
n_annotated,
snps.len()
);
Ok(())
}
fn annotate_from_bed(
snps: &[BimRecord],
bed_path: &str,
windowsize: u32,
nomerge: bool,
) -> Result<Vec<u8>> {
let intervals = load_bed_intervals(bed_path, windowsize, nomerge)?;
println!(
" Loaded BED intervals for {} chromosomes from '{}'",
intervals.len(),
bed_path
);
Ok(snps.iter().map(|s| annotate_snp(s, &intervals)).collect())
}
fn load_bed_intervals(
bed_path: &str,
windowsize: u32,
nomerge: bool,
) -> Result<HashMap<String, Vec<(u32, u32)>>> {
let file = File::open(bed_path).with_context(|| format!("opening BED file '{}'", bed_path))?;
let reader = BufReader::new(file);
let mut intervals: HashMap<String, Vec<(u32, u32)>> = HashMap::new();
for (i, line) in reader.lines().enumerate() {
let line = line.with_context(|| format!("reading BED line {}", i + 1))?;
let line = line.trim();
if line.is_empty()
|| line.starts_with('#')
|| line.starts_with("track")
|| line.starts_with("browser")
{
continue;
}
let cols: Vec<&str> = line.split('\t').collect();
if cols.len() < 3 {
anyhow::bail!(
"BED line {}: expected at least 3 columns (chrom start end), got {}",
i + 1,
cols.len()
);
}
let chrom = cols[0].trim_start_matches("chr").to_string();
let start: u32 = cols[1]
.parse()
.with_context(|| format!("BED line {}: invalid start '{}'", i + 1, cols[1]))?;
let end: u32 = cols[2]
.parse()
.with_context(|| format!("BED line {}: invalid end '{}'", i + 1, cols[2]))?;
let start = start.saturating_sub(windowsize);
let end = end.saturating_add(windowsize);
intervals.entry(chrom).or_default().push((start, end));
}
for ivs in intervals.values_mut() {
ivs.sort_by_key(|&(s, _)| s);
if !nomerge {
*ivs = merge_intervals(std::mem::take(ivs));
}
}
Ok(intervals)
}
fn merge_intervals(sorted: Vec<(u32, u32)>) -> Vec<(u32, u32)> {
if sorted.is_empty() {
return sorted;
}
let mut merged: Vec<(u32, u32)> = Vec::with_capacity(sorted.len());
let mut current = sorted[0];
for &(s, e) in &sorted[1..] {
if s <= current.1 {
current.1 = current.1.max(e);
} else {
merged.push(current);
current = (s, e);
}
}
merged.push(current);
merged
}
fn annotate_snp(snp: &BimRecord, intervals: &HashMap<String, Vec<(u32, u32)>>) -> u8 {
let chr_str = snp.chr.to_string();
if let Some(ivs) = intervals.get(&chr_str)
&& is_in_intervals(ivs, snp.bp)
{
return 1;
}
0
}
fn is_in_intervals(intervals: &[(u32, u32)], bp: u32) -> bool {
let pos = intervals.partition_point(|&(s, _)| s < bp);
if pos > 0 {
let (_, end) = intervals[pos - 1];
bp <= end } else {
false
}
}
fn annotate_from_gene_set(
snps: &[BimRecord],
gene_set_path: &str,
coord_path: &str,
windowsize: u32,
) -> Result<Vec<u8>> {
let gene_set_file = File::open(gene_set_path)
.with_context(|| format!("opening gene set file '{}'", gene_set_path))?;
let target_genes: HashSet<String> = BufReader::new(gene_set_file)
.lines()
.map_while(Result::ok)
.map(|l| l.trim().to_string())
.filter(|l| !l.is_empty() && !l.starts_with('#'))
.collect();
println!(
" Loaded {} gene symbols from '{}'",
target_genes.len(),
gene_set_path
);
let coord_file = File::open(coord_path)
.with_context(|| format!("opening gene coordinate file '{}'", coord_path))?;
let mut intervals: HashMap<String, Vec<(u32, u32)>> = HashMap::new();
let mut genes_found = 0usize;
for (i, line) in BufReader::new(coord_file).lines().enumerate() {
let line = line.with_context(|| format!("reading coord file line {}", i + 1))?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
let cols: Vec<&str> = line.split_whitespace().collect();
if cols.len() < 4 {
anyhow::bail!(
"Gene coord file line {}: expected GENE CHR START END, got {} columns",
i + 1,
cols.len()
);
}
let gene = cols[0];
if !target_genes.contains(gene) {
continue;
}
genes_found += 1;
let chr = cols[1].trim_start_matches("chr").to_string();
let start: u32 = cols[2]
.parse()
.with_context(|| format!("coord file line {}: invalid start '{}'", i + 1, cols[2]))?;
let end: u32 = cols[3]
.parse()
.with_context(|| format!("coord file line {}: invalid end '{}'", i + 1, cols[3]))?;
let start = start.saturating_sub(windowsize);
let end = end.saturating_add(windowsize);
intervals.entry(chr).or_default().push((start, end));
}
println!(
" Found coordinates for {}/{} target genes",
genes_found,
target_genes.len()
);
for ivs in intervals.values_mut() {
ivs.sort_by_key(|&(s, _)| s);
*ivs = merge_intervals(std::mem::take(ivs));
}
Ok(snps.iter().map(|s| annotate_snp(s, &intervals)).collect())
}
fn derive_annot_name(args: &MakeAnnotArgs) -> String {
let source = args
.bed_file
.as_deref()
.or(args.gene_set_file.as_deref())
.unwrap_or("annot");
std::path::Path::new(source)
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("ANNOT")
.to_string()
}
fn write_annot_file(
path: &str,
snps: &[BimRecord],
annotation: &[u8],
col_name: &str,
) -> Result<()> {
if path.ends_with(".gz") {
use flate2::Compression;
use flate2::write::GzEncoder;
let file = File::create(path).with_context(|| format!("creating '{}'", path))?;
let mut gz = GzEncoder::new(file, Compression::fast());
write_rows(&mut gz, snps, annotation, col_name)?;
gz.finish().context("finalizing gzip output")?;
} else {
let file = File::create(path).with_context(|| format!("creating '{}'", path))?;
let mut w = BufWriter::new(file);
write_rows(&mut w, snps, annotation, col_name)?;
}
Ok(())
}
fn write_rows(
w: &mut impl Write,
snps: &[BimRecord],
annotation: &[u8],
col_name: &str,
) -> Result<()> {
writeln!(w, "CHR\tBP\tSNP\tCM\t{}", col_name)?;
for (snp, &ann) in snps.iter().zip(annotation.iter()) {
writeln!(
w,
"{}\t{}\t{}\t{:.6}\t{}",
snp.chr, snp.bp, snp.snp, snp.cm, ann
)?;
}
Ok(())
}