Skip to main content

rsomics_vcf_reheader/
lib.rs

1use std::collections::HashMap;
2use std::io::{self, BufRead, BufReader, BufWriter, Write};
3use std::path::Path;
4
5use rsomics_common::{Result, RsomicsError};
6
7/// Replace the entire header (all `#`-prefixed lines) with the contents of
8/// `header_file`, then stream data records unchanged.
9pub fn reheader_replace(
10    input: &mut dyn io::Read,
11    header_file: &Path,
12    output: &mut dyn io::Write,
13) -> Result<u64> {
14    let new_header = std::fs::read_to_string(header_file).map_err(RsomicsError::Io)?;
15    let mut out = BufWriter::new(output);
16
17    // Emit the replacement header, ensuring each line ends with exactly one newline.
18    for line in new_header.lines() {
19        out.write_all(line.as_bytes()).map_err(RsomicsError::Io)?;
20        out.write_all(b"\n").map_err(RsomicsError::Io)?;
21    }
22
23    // Stream data records from the input, skipping its original header.
24    let mut reader = BufReader::new(input);
25    let mut line = String::new();
26    let mut records: u64 = 0;
27    loop {
28        line.clear();
29        let n = reader.read_line(&mut line).map_err(RsomicsError::Io)?;
30        if n == 0 {
31            break;
32        }
33        let trimmed = line.trim_end_matches(['\n', '\r']);
34        if trimmed.is_empty() {
35            continue;
36        }
37        if trimmed.starts_with('#') {
38            // Skip original header lines.
39            continue;
40        }
41        out.write_all(trimmed.as_bytes())
42            .map_err(RsomicsError::Io)?;
43        out.write_all(b"\n").map_err(RsomicsError::Io)?;
44        records += 1;
45    }
46
47    out.flush().map_err(RsomicsError::Io)?;
48    Ok(records)
49}
50
51/// Parse a samples file.
52///
53/// Each line is either:
54/// - a single name (positional rename, index-based), or
55/// - `old_name  new_name` (map-based rename).
56///
57/// Returns `(positional, map)`. If any line has two whitespace-separated
58/// tokens the map form is used; otherwise positional.
59fn parse_samples_file(path: &Path) -> Result<(Vec<String>, HashMap<String, String>)> {
60    let content = std::fs::read_to_string(path).map_err(RsomicsError::Io)?;
61    let mut positional: Vec<String> = Vec::new();
62    let mut map: HashMap<String, String> = HashMap::new();
63    let mut map_mode = false;
64
65    for raw in content.lines() {
66        let line = raw.trim();
67        if line.is_empty() {
68            continue;
69        }
70        let mut parts = line.splitn(2, |c: char| c.is_whitespace());
71        let first = parts.next().unwrap_or("").trim();
72        let second = parts.next().map(|s| s.trim());
73        if let Some(new_name) = second
74            && !new_name.is_empty()
75        {
76            map_mode = true;
77            map.insert(first.to_owned(), new_name.to_owned());
78            continue;
79        }
80        positional.push(first.to_owned());
81    }
82
83    if map_mode {
84        Ok((Vec::new(), map))
85    } else {
86        Ok((positional, HashMap::new()))
87    }
88}
89
90/// Rename samples on the `#CHROM` line, leaving all other header lines and
91/// all data records unchanged.
92pub fn reheader_samples(
93    input: &mut dyn io::Read,
94    samples_file: &Path,
95    output: &mut dyn io::Write,
96) -> Result<u64> {
97    let (positional, map) = parse_samples_file(samples_file)?;
98    let mut out = BufWriter::new(output);
99    let mut reader = BufReader::new(input);
100    let mut line = String::new();
101    let mut records: u64 = 0;
102
103    loop {
104        line.clear();
105        let n = reader.read_line(&mut line).map_err(RsomicsError::Io)?;
106        if n == 0 {
107            break;
108        }
109        let trimmed = line.trim_end_matches(['\n', '\r']);
110        if trimmed.is_empty() {
111            continue;
112        }
113
114        if trimmed.starts_with('#') {
115            if trimmed.starts_with("#CHROM") {
116                // Rewrite sample columns (fields 9+).
117                let cols: Vec<&str> = trimmed.split('\t').collect();
118                let fixed_cols = if cols.len() > 9 {
119                    let fixed: Vec<String> = cols[9..]
120                        .iter()
121                        .enumerate()
122                        .map(|(i, old)| {
123                            if !map.is_empty() {
124                                map.get(*old).map(String::as_str).unwrap_or(old).to_owned()
125                            } else if i < positional.len() {
126                                positional[i].clone()
127                            } else {
128                                old.to_string()
129                            }
130                        })
131                        .collect();
132                    [
133                        &cols[..9],
134                        fixed
135                            .iter()
136                            .map(String::as_str)
137                            .collect::<Vec<_>>()
138                            .as_slice(),
139                    ]
140                    .concat()
141                    .join("\t")
142                } else {
143                    trimmed.to_owned()
144                };
145                out.write_all(fixed_cols.as_bytes())
146                    .map_err(RsomicsError::Io)?;
147            } else {
148                out.write_all(trimmed.as_bytes())
149                    .map_err(RsomicsError::Io)?;
150            }
151            out.write_all(b"\n").map_err(RsomicsError::Io)?;
152        } else {
153            out.write_all(trimmed.as_bytes())
154                .map_err(RsomicsError::Io)?;
155            out.write_all(b"\n").map_err(RsomicsError::Io)?;
156            records += 1;
157        }
158    }
159
160    out.flush().map_err(RsomicsError::Io)?;
161    Ok(records)
162}
163
164/// Pass-through: copy input to output unchanged (neither -h nor -s given).
165pub fn passthrough(input: &mut dyn io::Read, output: &mut dyn io::Write) -> Result<u64> {
166    let mut out = BufWriter::new(output);
167    let mut reader = BufReader::new(input);
168    let mut line = String::new();
169    let mut records: u64 = 0;
170    loop {
171        line.clear();
172        let n = reader.read_line(&mut line).map_err(RsomicsError::Io)?;
173        if n == 0 {
174            break;
175        }
176        let trimmed = line.trim_end_matches(['\n', '\r']);
177        if trimmed.is_empty() {
178            continue;
179        }
180        out.write_all(trimmed.as_bytes())
181            .map_err(RsomicsError::Io)?;
182        out.write_all(b"\n").map_err(RsomicsError::Io)?;
183        if !trimmed.starts_with('#') {
184            records += 1;
185        }
186    }
187    out.flush().map_err(RsomicsError::Io)?;
188    Ok(records)
189}