Skip to main content

fgumi_lib/
header.rs

1//! Utilities for adding @PG (program) records to SAM headers.
2//!
3//! This module provides functions for managing @PG records in SAM/BAM headers,
4//! including automatic PP (previous program) chaining and ID collision handling.
5
6use anyhow::Result;
7use bstr::BString;
8use noodles::sam::Header;
9use noodles::sam::header::record::value::Map;
10use noodles::sam::header::record::value::map::Program;
11use noodles::sam::header::record::value::map::program::tag;
12use std::collections::HashSet;
13
14/// Get the ID of the last program in the @PG chain (for PP chaining).
15///
16/// Finds the program that is not referenced by any other program's PP tag,
17/// i.e., the "leaf" of the chain.
18///
19/// # Arguments
20///
21/// * `header` - The SAM header to search
22///
23/// # Returns
24///
25/// The ID of the last program in the chain, or `None` if there are no programs.
26#[must_use]
27pub fn get_last_program_id(header: &Header) -> Option<String> {
28    let programs = header.programs();
29    let program_map = programs.as_ref();
30
31    if program_map.is_empty() {
32        return None;
33    }
34
35    // Collect all program IDs that are referenced as PP by other programs
36    let mut referenced: HashSet<&[u8]> = HashSet::new();
37    for (_id, pg) in program_map {
38        if let Some(pp) = pg.other_fields().get(&tag::PREVIOUS_PROGRAM_ID) {
39            referenced.insert(pp.as_ref());
40        }
41    }
42
43    // Find a program that is NOT referenced (the leaf/end of chain)
44    for (id, _pg) in program_map {
45        if !referenced.contains(id.as_slice()) {
46            return Some(String::from_utf8_lossy(id).to_string());
47        }
48    }
49
50    // Fallback: return any program ID (shouldn't happen with valid headers)
51    program_map.keys().next().map(|id| String::from_utf8_lossy(id).to_string())
52}
53
54/// Create a unique program ID by appending .1, .2, etc. if needed.
55///
56/// # Arguments
57///
58/// * `header` - The SAM header to check for existing IDs
59/// * `base_id` - The base program ID to use (e.g., "fgumi")
60///
61/// # Returns
62///
63/// A unique program ID, either the base ID or with a numeric suffix.
64#[must_use]
65pub fn make_unique_program_id(header: &Header, base_id: &str) -> String {
66    let programs = header.programs();
67    let program_map = programs.as_ref();
68
69    // Check if base ID is available
70    if !program_map.contains_key(base_id.as_bytes()) {
71        return base_id.to_string();
72    }
73
74    // Append numeric suffix until unique
75    for i in 1..=1000 {
76        let candidate = format!("{base_id}.{i}");
77        if !program_map.contains_key(candidate.as_bytes()) {
78            return candidate;
79        }
80    }
81
82    // Extremely unlikely fallback
83    format!("{base_id}.{}", std::process::id())
84}
85
86/// Build a @PG record with all standard fields.
87///
88/// # Arguments
89///
90/// * `version` - Program version string
91/// * `command_line` - Full command line invocation
92/// * `previous_program` - Optional ID of previous program for PP chaining
93///
94/// # Returns
95///
96/// A `Map<Program>` ready to add to a header.
97/// # Errors
98///
99/// Returns an error if the program record cannot be built.
100pub fn build_program_record(
101    version: &str,
102    command_line: &str,
103    previous_program: Option<&str>,
104) -> Result<Map<Program>> {
105    let mut builder = Map::<Program>::builder()
106        .insert(tag::NAME, "fgumi")
107        .insert(tag::VERSION, version)
108        .insert(tag::COMMAND_LINE, command_line);
109
110    if let Some(pp) = previous_program {
111        builder = builder.insert(tag::PREVIOUS_PROGRAM_ID, pp);
112    }
113
114    Ok(builder.build()?)
115}
116
117/// Add a @PG record to an existing header with automatic PP chaining.
118///
119/// This function:
120/// 1. Finds the last program in the existing @PG chain
121/// 2. Creates a unique ID (appending .1, .2 if "fgumi" exists)
122/// 3. Adds the new @PG with PP pointing to the previous program
123///
124/// # Arguments
125///
126/// * `header` - The header to modify
127/// * `version` - Program version string
128/// * `command_line` - Full command line invocation
129///
130/// # Returns
131///
132/// The modified header with the new @PG record.
133/// # Errors
134///
135/// Returns an error if the program record cannot be added to the header.
136pub fn add_pg_record(mut header: Header, version: &str, command_line: &str) -> Result<Header> {
137    let previous_program = get_last_program_id(&header);
138    let unique_id = make_unique_program_id(&header, "fgumi");
139    let pg_record = build_program_record(version, command_line, previous_program.as_deref())?;
140
141    header.programs_mut().add(BString::from(unique_id), pg_record)?;
142
143    Ok(header)
144}
145
146/// Add a @PG record to a header builder (for commands creating new headers).
147///
148/// Use this when building a header from scratch (no PP chaining needed).
149///
150/// # Arguments
151///
152/// * `builder` - The header builder to modify
153/// * `version` - Program version string
154/// * `command_line` - Full command line invocation
155///
156/// # Returns
157///
158/// The modified header builder.
159/// # Errors
160///
161/// Returns an error if the program record cannot be built.
162pub fn add_pg_to_builder(
163    builder: noodles::sam::header::Builder,
164    version: &str,
165    command_line: &str,
166) -> Result<noodles::sam::header::Builder> {
167    let pg_record = build_program_record(version, command_line, None)?;
168    Ok(builder.add_program("fgumi", pg_record))
169}
170
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a header holding one default (field-less) @PG record per ID, added in order.
    fn header_with_default_programs(ids: &[&str]) -> Header {
        let mut header = Header::default();
        for id in ids {
            header.programs_mut().add(BString::from(*id), Map::<Program>::default()).unwrap();
        }
        header
    }

    /// Fetches the @PG record stored under `id`, panicking when it is absent.
    fn program<'h>(header: &'h Header, id: &str) -> &'h Map<Program> {
        header.programs().as_ref().get(id.as_bytes()).unwrap()
    }

    /// Reads the PP tag of a record as raw bytes, if the tag is present.
    fn previous_program(record: &Map<Program>) -> Option<&[u8]> {
        record.other_fields().get(&tag::PREVIOUS_PROGRAM_ID).map(AsRef::as_ref)
    }

    #[test]
    fn test_get_last_program_id_empty() {
        assert_eq!(get_last_program_id(&Header::default()), None);
    }

    #[test]
    fn test_get_last_program_id_single() {
        let header = header_with_default_programs(&["bwa"]);
        assert_eq!(get_last_program_id(&header), Some("bwa".to_string()));
    }

    #[test]
    fn test_get_last_program_id_chained() {
        // First program has no predecessor.
        let mut header = header_with_default_programs(&["bwa"]);

        // Second program points back at the first.
        let chained =
            Map::<Program>::builder().insert(tag::PREVIOUS_PROGRAM_ID, "bwa").build().unwrap();
        header.programs_mut().add(BString::from("samtools"), chained).unwrap();

        // Nothing references samtools, so it is the end of the chain.
        assert_eq!(get_last_program_id(&header), Some("samtools".to_string()));
    }

    #[test]
    fn test_make_unique_program_id_no_collision() {
        assert_eq!(make_unique_program_id(&Header::default(), "fgumi"), "fgumi");
    }

    #[test]
    fn test_make_unique_program_id_with_collision() {
        let header = header_with_default_programs(&["fgumi"]);
        assert_eq!(make_unique_program_id(&header, "fgumi"), "fgumi.1");
    }

    #[test]
    fn test_make_unique_program_id_multiple_collisions() {
        let header = header_with_default_programs(&["fgumi", "fgumi.1"]);
        assert_eq!(make_unique_program_id(&header, "fgumi"), "fgumi.2");
    }

    #[test]
    fn test_add_pg_record_empty_header() {
        let updated = add_pg_record(Header::default(), "1.0.0", "fgumi test").unwrap();
        assert_eq!(updated.programs().as_ref().len(), 1);
        assert!(updated.programs().as_ref().contains_key(b"fgumi".as_slice()));

        // The new record carries name, version and command line, but no PP.
        let record = program(&updated, "fgumi");
        let fields = record.other_fields();
        assert_eq!(fields.get(&tag::NAME).map(AsRef::as_ref), Some(b"fgumi".as_slice()));
        assert_eq!(fields.get(&tag::VERSION).map(AsRef::as_ref), Some(b"1.0.0".as_slice()));
        assert_eq!(
            fields.get(&tag::COMMAND_LINE).map(AsRef::as_ref),
            Some(b"fgumi test".as_slice())
        );
        assert_eq!(previous_program(record), None);
    }

    #[test]
    fn test_add_pg_record_with_existing_fgumi() {
        let header = header_with_default_programs(&["fgumi"]);

        let updated = add_pg_record(header, "1.0.0", "fgumi test2").unwrap();
        assert_eq!(updated.programs().as_ref().len(), 2);
        assert!(updated.programs().as_ref().contains_key(b"fgumi.1".as_slice()));

        // The suffixed record chains back to the pre-existing one.
        assert_eq!(previous_program(program(&updated, "fgumi.1")), Some(b"fgumi".as_slice()));
    }

    #[test]
    fn test_add_pg_record_chains_to_non_fgumi() {
        // Start from a header that already records a BWA run.
        let bwa = Map::<Program>::builder()
            .insert(tag::NAME, "bwa")
            .insert(tag::VERSION, "0.7.17")
            .build()
            .unwrap();
        let mut header = Header::default();
        header.programs_mut().add(BString::from("bwa"), bwa).unwrap();

        let updated = add_pg_record(header, "1.0.0", "fgumi group -i in.bam").unwrap();

        // fgumi should chain to bwa.
        assert_eq!(previous_program(program(&updated, "fgumi")), Some(b"bwa".as_slice()));
    }

    #[test]
    fn test_add_pg_to_builder() {
        let header =
            add_pg_to_builder(Header::builder(), "1.0.0", "fgumi extract").unwrap().build();
        assert_eq!(header.programs().as_ref().len(), 1);

        let record = program(&header, "fgumi");
        assert_eq!(
            record.other_fields().get(&tag::NAME).map(AsRef::as_ref),
            Some(b"fgumi".as_slice())
        );
        assert_eq!(previous_program(record), None);
    }

    #[test]
    fn test_add_pg_record_empty_command_line() {
        let updated = add_pg_record(Header::default(), "1.0.0", "").unwrap();
        assert_eq!(updated.programs().as_ref().len(), 1);
        assert!(updated.programs().as_ref().contains_key(b"fgumi".as_slice()));
    }

    #[test]
    fn test_add_pg_record_write_to_bam() {
        use crate::bam_io::create_bam_writer;
        use tempfile::TempDir;

        let scratch = TempDir::new().unwrap();
        let bam_path = scratch.path().join("test.bam");

        let updated = add_pg_record(Header::default(), "1.0.0", "fgumi test").unwrap();

        // Creating the writer must succeed with the resulting header.
        let _writer = create_bam_writer(&bam_path, &updated, 1, 6).unwrap();
    }

    #[test]
    fn test_add_pg_record_chains_to_empty_program() {
        use crate::bam_io::create_bam_writer;
        use tempfile::TempDir;

        // Simulate what SamBuilder does - adds an empty/default program
        let header =
            Header::builder().add_program("SamBuilder", Map::<Program>::default()).build();

        // Now add our fgumi @PG record
        let updated = add_pg_record(header, "1.0.0", "fgumi test").unwrap();
        assert_eq!(updated.programs().as_ref().len(), 2);

        // fgumi should chain to SamBuilder
        assert_eq!(
            previous_program(program(&updated, "fgumi")),
            Some(b"SamBuilder".as_slice())
        );

        // Creating the writer must succeed with the resulting header.
        let scratch = TempDir::new().unwrap();
        let bam_path = scratch.path().join("test.bam");
        let _writer = create_bam_writer(&bam_path, &updated, 1, 6).unwrap();
    }
}