//! rsomics-vcf-reheader 0.1.0
//!
//! Replace a VCF header or rename samples — a Rust port of `bcftools reheader`.
//!
//! Documentation
use std::collections::HashMap;
use std::io::{self, BufRead, BufReader, BufWriter, Write};
use std::path::Path;

use rsomics_common::{Result, RsomicsError};

/// Replace the entire header (all `#`-prefixed lines) of `input` with the
/// contents of `header_file`, then stream the data records to `output`.
///
/// The replacement header is read fully into memory and written first. Each
/// non-empty line is terminated with a single `\n`; empty lines in the header
/// file (including a trailing one) are dropped so that no blank line ends up
/// inside the emitted header.
///
/// The input is then read line by line. Trailing `\n` / `\r` characters are
/// stripped, empty lines and every line starting with `#` are discarded, and
/// each remaining line is written followed by a single `\n`.
///
/// # Returns
///
/// The number of data records (non-empty, non-`#` input lines) written.
///
/// # Errors
///
/// Returns `RsomicsError::Io` if the header file cannot be read, if reading
/// `input` fails (the input is read as text, so invalid UTF-8 is reported as
/// an I/O error), or if writing to or flushing `output` fails.
pub fn reheader_replace(
    input: &mut dyn io::Read,
    header_file: &Path,
    output: &mut dyn io::Write,
) -> Result<u64> {
    let new_header = std::fs::read_to_string(header_file).map_err(RsomicsError::Io)?;
    let mut out = BufWriter::new(output);

    // Emit the replacement header. Blank lines are skipped: they are not valid
    // in a VCF header and the input side of this function drops them as well.
    for line in new_header.lines().filter(|line| !line.is_empty()) {
        out.write_all(line.as_bytes()).map_err(RsomicsError::Io)?;
        out.write_all(b"\n").map_err(RsomicsError::Io)?;
    }

    // Stream data records from the input, skipping its original header.
    let mut reader = BufReader::new(input);
    let mut line = String::new();
    let mut records: u64 = 0;
    loop {
        line.clear();
        let n = reader.read_line(&mut line).map_err(RsomicsError::Io)?;
        if n == 0 {
            // End of input.
            break;
        }
        let trimmed = line.trim_end_matches(['\n', '\r']);
        // Blank lines and original header lines are not copied.
        if trimmed.is_empty() || trimmed.starts_with('#') {
            continue;
        }
        out.write_all(trimmed.as_bytes())
            .map_err(RsomicsError::Io)?;
        out.write_all(b"\n").map_err(RsomicsError::Io)?;
        records += 1;
    }

    // Flush explicitly so write errors surface instead of being lost on drop.
    out.flush().map_err(RsomicsError::Io)?;
    Ok(records)
}

/// Read a sample-renaming table from `path`.
///
/// Lines are trimmed and blank lines are ignored. Every remaining line is
/// split at its first whitespace character:
/// - a line with nothing after the first token is a positional name
///   (the n-th such line renames the n-th sample);
/// - a line with a non-empty remainder is an `old_name  new_name` mapping,
///   where the trimmed remainder is the new name.
///
/// Returns `(positional, map)` with at most one side populated. If at least
/// one mapping line was seen, `positional` is empty (single-token lines are
/// discarded) and `map` holds the pairs. Otherwise `positional` holds the
/// names in file order and `map` is empty.
///
/// # Errors
///
/// Returns `RsomicsError::Io` if the file cannot be read as text.
fn parse_samples_file(path: &Path) -> Result<(Vec<String>, HashMap<String, String>)> {
    let text = std::fs::read_to_string(path).map_err(RsomicsError::Io)?;
    let mut names_in_order: Vec<String> = Vec::new();
    let mut renames: HashMap<String, String> = HashMap::new();

    for entry in text.lines().map(str::trim).filter(|l| !l.is_empty()) {
        // Split once at the first whitespace; no whitespace means a lone name.
        let (head, tail) = match entry.split_once(char::is_whitespace) {
            Some((head, tail)) => (head.trim(), tail.trim()),
            None => (entry, ""),
        };
        if tail.is_empty() {
            names_in_order.push(head.to_owned());
        } else {
            renames.insert(head.to_owned(), tail.to_owned());
        }
    }

    // Any mapping line switches the whole file to map mode.
    if renames.is_empty() {
        Ok((names_in_order, renames))
    } else {
        Ok((Vec::new(), renames))
    }
}

/// Rename the sample columns of the `#CHROM` line and copy everything else.
///
/// The renaming table is loaded from `samples_file` via `parse_samples_file`.
/// On each line starting with `#CHROM`, the tab-separated columns from the
/// tenth onward are treated as sample names:
/// - if the table contains `old → new` mappings, a sample is replaced when its
///   current name is a key of the mapping and kept otherwise;
/// - otherwise the i-th sample is replaced by the i-th positional name, and
///   samples beyond the end of the positional list are kept.
///
/// All other lines are written as read, except that trailing `\n` / `\r`
/// characters are replaced by a single `\n` and empty lines are dropped.
///
/// # Returns
///
/// The number of non-empty lines that do not start with `#`.
///
/// # Errors
///
/// Returns an error if the samples file cannot be parsed, or
/// `RsomicsError::Io` if reading `input` or writing/flushing `output` fails.
pub fn reheader_samples(
    input: &mut dyn io::Read,
    samples_file: &Path,
    output: &mut dyn io::Write,
) -> Result<u64> {
    let (by_position, by_name) = parse_samples_file(samples_file)?;
    let mut source = BufReader::new(input);
    let mut sink = BufWriter::new(output);
    let mut buf = String::new();
    let mut data_lines: u64 = 0;

    while source.read_line(&mut buf).map_err(RsomicsError::Io)? > 0 {
        let content = buf.trim_end_matches(['\n', '\r']);
        if !content.is_empty() {
            if content.starts_with("#CHROM") {
                // Columns 0..9 are the fixed VCF columns; the rest are samples.
                let mut columns: Vec<&str> = content.split('\t').collect();
                for (idx, name) in columns.iter_mut().skip(9).enumerate() {
                    let renamed = if by_name.is_empty() {
                        by_position.get(idx).map(String::as_str)
                    } else {
                        by_name.get(*name).map(String::as_str)
                    };
                    if let Some(new_name) = renamed {
                        *name = new_name;
                    }
                }
                sink.write_all(columns.join("\t").as_bytes())
                    .map_err(RsomicsError::Io)?;
            } else {
                sink.write_all(content.as_bytes())
                    .map_err(RsomicsError::Io)?;
            }
            sink.write_all(b"\n").map_err(RsomicsError::Io)?;

            // Only non-header lines count as records.
            if !content.starts_with('#') {
                data_lines += 1;
            }
        }
        buf.clear();
    }

    sink.flush().map_err(RsomicsError::Io)?;
    Ok(data_lines)
}

/// Pass-through mode (neither `-h` nor `-s` given): copy `input` to `output`
/// line by line.
///
/// Each line has its trailing `\n` / `\r` characters stripped and is written
/// back followed by a single `\n`; lines that are empty after stripping are
/// dropped. Header and data lines are otherwise copied as read.
///
/// # Returns
///
/// The number of written lines that do not start with `#`.
///
/// # Errors
///
/// Returns `RsomicsError::Io` if reading `input` or writing/flushing `output`
/// fails.
pub fn passthrough(input: &mut dyn io::Read, output: &mut dyn io::Write) -> Result<u64> {
    let mut source = BufReader::new(input);
    let mut sink = BufWriter::new(output);
    let mut buf = String::new();
    let mut data_lines: u64 = 0;

    while source.read_line(&mut buf).map_err(RsomicsError::Io)? > 0 {
        let content = buf.trim_end_matches(['\n', '\r']);
        if !content.is_empty() {
            sink.write_all(content.as_bytes())
                .map_err(RsomicsError::Io)?;
            sink.write_all(b"\n").map_err(RsomicsError::Io)?;
            // Header lines are copied but not counted.
            data_lines += u64::from(!content.starts_with('#'));
        }
        buf.clear();
    }

    sink.flush().map_err(RsomicsError::Io)?;
    Ok(data_lines)
}