Skip to main content

fgumi_umi/
lib.rs

1#![deny(unsafe_code)]
2
3//! UMI (Unique Molecular Identifier) utilities
4//!
5//! This crate provides functionality for working with UMIs including:
6//! - UMI assignment strategies (identity, edit-distance, adjacency, paired)
7//! - Tag set management for consensus calling
8//! - UMI grouping and family analysis
9
10pub mod assigner;
11
12// Re-export commonly used items
13pub use assigner::{
14    AdjacencyUmiAssigner, IdentityUmiAssigner, PairedUmiAssigner, SimpleErrorUmiAssigner, Strategy,
15    Umi, UmiAssigner,
16};
17
18use std::collections::HashSet;
19
20/// Molecule identifier for UMI grouping.
21///
22/// Represents the assigned molecule ID during UMI-based grouping. The ID is stored
23/// as an integer during processing for efficiency, and only converted to a string
24/// when writing the final BAM output.
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
26pub enum MoleculeId {
27    /// Not yet assigned (default state)
28    #[default]
29    None,
30    /// Non-paired strategy: simple integer ID (e.g., "42")
31    Single(u64),
32    /// Paired strategy, top strand (e.g., "42/A")
33    PairedA(u64),
34    /// Paired strategy, bottom strand (e.g., "42/B")
35    PairedB(u64),
36}
37
38impl MoleculeId {
39    /// Get the base molecule ID value, if assigned.
40    #[inline]
41    #[must_use]
42    pub fn id(&self) -> Option<u64> {
43        match self {
44            MoleculeId::None => None,
45            MoleculeId::Single(id) | MoleculeId::PairedA(id) | MoleculeId::PairedB(id) => Some(*id),
46        }
47    }
48
49    /// Check if a molecule ID has been assigned.
50    #[inline]
51    #[must_use]
52    pub fn is_assigned(&self) -> bool {
53        !matches!(self, MoleculeId::None)
54    }
55
56    /// Convert to string representation with a base offset applied.
57    ///
58    /// Used when serializing to BAM output, where local IDs (0, 1, 2, ...)
59    /// need to be converted to global IDs by adding the base offset.
60    #[must_use]
61    pub fn to_string_with_offset(&self, base: u64) -> String {
62        let mut buf = String::new();
63        self.write_with_offset(base, &mut buf);
64        buf
65    }
66
67    /// Write string representation into a reusable buffer, returning the bytes.
68    ///
69    /// Avoids per-call String allocation by reusing the caller's buffer.
70    pub fn write_with_offset<'a>(&self, base: u64, buf: &'a mut String) -> &'a [u8] {
71        use std::fmt::Write;
72        buf.clear();
73        match self {
74            MoleculeId::None => {}
75            MoleculeId::Single(id) => write!(buf, "{}", base + id).unwrap(),
76            MoleculeId::PairedA(id) => write!(buf, "{}/A", base + id).unwrap(),
77            MoleculeId::PairedB(id) => write!(buf, "{}/B", base + id).unwrap(),
78        }
79        buf.as_bytes()
80    }
81
82    /// Convert to a Vec index for grouping templates by molecule ID.
83    ///
84    /// Returns `None` for unassigned `MoleculeId`s. For assigned IDs:
85    /// - `Single(id)`: returns id (indices: 0, 1, 2, ...)
86    /// - `PairedA(id)`: returns id * 2 (indices: 0, 2, 4, ...)
87    /// - `PairedB(id)`: returns id * 2 + 1 (indices: 1, 3, 5, ...)
88    ///
89    /// This preserves the sort order (A before B for same base id) and allows
90    /// efficient Vec-based grouping instead of `HashMap`.
91    #[inline]
92    #[must_use]
93    #[expect(clippy::cast_possible_truncation, reason = "molecule IDs never exceed usize::MAX / 2")]
94    pub fn to_vec_index(&self) -> Option<usize> {
95        match self {
96            MoleculeId::None => None,
97            MoleculeId::Single(id) => Some(*id as usize),
98            MoleculeId::PairedA(id) => Some(*id as usize * 2),
99            MoleculeId::PairedB(id) => Some(*id as usize * 2 + 1),
100        }
101    }
102
103    /// Check if this is a paired molecule ID (`PairedA` or `PairedB`).
104    #[inline]
105    #[must_use]
106    pub fn is_paired(&self) -> bool {
107        matches!(self, MoleculeId::PairedA(_) | MoleculeId::PairedB(_))
108    }
109
110    /// Get the base ID without suffix for grouping purposes.
111    ///
112    /// For paired `MoleculeId`s, this returns the base ID (without /A or /B).
113    /// Used when grouping templates that share the same molecule but different strands.
114    #[inline]
115    #[must_use]
116    pub fn base_id_string(&self) -> String {
117        match self {
118            MoleculeId::None => String::new(),
119            MoleculeId::Single(id) | MoleculeId::PairedA(id) | MoleculeId::PairedB(id) => {
120                id.to_string()
121            }
122        }
123    }
124}
125
126impl std::fmt::Display for MoleculeId {
127    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
128        match self {
129            MoleculeId::None => write!(f, ""),
130            MoleculeId::Single(id) => write!(f, "{id}"),
131            MoleculeId::PairedA(id) => write!(f, "{id}/A"),
132            MoleculeId::PairedB(id) => write!(f, "{id}/B"),
133        }
134    }
135}
136
137/// Named sets of tags for common use cases
138///
139/// Provides predefined tag sets that can be referenced by name (e.g., "Consensus")
140/// instead of listing individual tags.
141pub struct TagSets;
142
143impl TagSets {
144    /// Consensus tags that should be reversed.
145    ///
146    /// These are per-base tags from fgbio consensus callers that need to be
147    /// reversed when the read is mapped to the negative strand.
148    pub const CONSENSUS_REVERSE: &[&str] = &["ad", "ae", "bd", "be", "cd"];
149
150    /// Consensus tags that should be reverse complemented.
151    ///
152    /// These are per-base sequence tags from fgbio consensus callers that need
153    /// to be reverse complemented when the read is mapped to the negative strand.
154    pub const CONSENSUS_REVCOMP: &[&str] = &["aD", "bD", "cD"];
155}
156
157/// Information about which tags to remove, reverse, or reverse complement
158///
159/// Stores sets of tag names that should be manipulated during the merge operation.
160/// Named tag sets (like "Consensus") are expanded into their constituent tags.
161#[derive(Debug, Clone)]
162pub struct TagInfo {
163    /// Tags to remove from mapped reads
164    pub remove: HashSet<String>,
165    /// Tags to reverse for negative strand reads
166    pub reverse: HashSet<String>,
167    /// Tags to reverse complement for negative strand reads
168    pub revcomp: HashSet<String>,
169}
170
171impl TagInfo {
172    /// Creates a new `TagInfo` from lists of tag names
173    ///
174    /// Expands named tag sets (e.g., "Consensus") into their constituent tags.
175    /// Individual tag names are added as-is.
176    ///
177    /// # Arguments
178    ///
179    /// * `remove` - Tags to remove from mapped reads
180    /// * `reverse` - Tags to reverse for negative strand (or "Consensus")
181    /// * `revcomp` - Tags to reverse complement for negative strand (or "Consensus")
182    ///
183    /// # Examples
184    ///
185    /// ```
186    /// use fgumi_umi::TagInfo;
187    /// let tag_info = TagInfo::new(
188    ///     vec!["AS".to_string()],
189    ///     vec!["Consensus".to_string()],
190    ///     vec!["Consensus".to_string()],
191    /// );
192    /// ```
193    #[must_use]
194    pub fn new(remove: Vec<String>, reverse: Vec<String>, revcomp: Vec<String>) -> Self {
195        let mut reverse_set = HashSet::new();
196        let mut revcomp_set = HashSet::new();
197
198        for tag in reverse {
199            if tag == "Consensus" {
200                reverse_set.extend(TagSets::CONSENSUS_REVERSE.iter().map(|&s| s.to_owned()));
201            } else {
202                reverse_set.insert(tag);
203            }
204        }
205
206        for tag in revcomp {
207            if tag == "Consensus" {
208                revcomp_set.extend(TagSets::CONSENSUS_REVCOMP.iter().map(|&s| s.to_owned()));
209            } else {
210                revcomp_set.insert(tag);
211            }
212        }
213
214        TagInfo { remove: remove.into_iter().collect(), reverse: reverse_set, revcomp: revcomp_set }
215    }
216
217    /// Checks if any tags need reversal or reverse complementation
218    ///
219    /// # Returns
220    ///
221    /// `true` if there are any tags to reverse or revcomp, `false` otherwise
222    #[must_use]
223    pub fn has_revs_or_revcomps(&self) -> bool {
224        !self.reverse.is_empty() || !self.revcomp.is_empty()
225    }
226}
227
228/// Result of validating a UMI string.
229///
230/// Used by both `group` and `dedup` commands to consistently filter UMIs.
231#[derive(Debug, Clone, Copy, PartialEq, Eq)]
232pub enum UmiValidation {
233    /// UMI is valid with the given number of bases (ACGT, case-insensitive)
234    Valid(usize),
235    /// UMI contains 'N' (uppercase only, matching fgbio behavior)
236    ContainsN,
237}
238
239/// Validates a UMI string, matching fgbio's behavior.
240///
241/// This function:
242/// - Counts valid DNA bases (A, C, G, T - case insensitive)
243/// - Rejects UMIs containing uppercase 'N' (ambiguous base)
244/// - Skips dashes (paired UMI separator) and other characters including lowercase 'n'
245///
246/// This matches fgbio's `GroupReadsByUmi` which:
247/// 1. Only rejects uppercase 'N': `.filter(r => !umi.contains('N'))`
248/// 2. Counts bases case-insensitively: `umi.toUpperCase.count(c => isUpperACGTN(c))`
249///
250/// # Arguments
251///
252/// * `umi` - The UMI string to validate
253///
254/// # Returns
255///
256/// * `UmiValidation::Valid(count)` - UMI is valid with `count` DNA bases
257/// * `UmiValidation::ContainsN` - UMI contains uppercase 'N'
258///
259/// # Examples
260///
261/// ```
262/// use fgumi_umi::{validate_umi, UmiValidation};
263///
264/// assert_eq!(validate_umi(b"ACGT"), UmiValidation::Valid(4));
265/// assert_eq!(validate_umi(b"acgt"), UmiValidation::Valid(4));
266/// assert_eq!(validate_umi(b"ACGT-TGCA"), UmiValidation::Valid(8));
267/// assert_eq!(validate_umi(b"ACNT"), UmiValidation::ContainsN);
268/// assert_eq!(validate_umi(b"acnt"), UmiValidation::Valid(3)); // lowercase 'n' skipped
269/// ```
270#[must_use]
271pub fn validate_umi(umi: &[u8]) -> UmiValidation {
272    let mut base_count = 0usize;
273    for &b in umi {
274        match b {
275            b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't' => base_count += 1,
276            b'N' => return UmiValidation::ContainsN,
277            _ => {} // Skip dash, lowercase 'n', other chars (matches fgbio)
278        }
279    }
280    UmiValidation::Valid(base_count)
281}
282
283/// Extracts the molecular identifier base, removing any trailing /A, /B suffix.
284///
285/// MI tags in duplex sequencing have suffixes `/A` or `/B` to indicate
286/// which strand the read came from. This function strips those specific suffixes
287/// to get the base molecular identifier for grouping purposes.
288///
289/// Only `/A` and `/B` suffixes are stripped; other suffixes are preserved.
290///
291/// # Examples
292///
293/// ```
294/// use fgumi_umi::extract_mi_base;
295///
296/// assert_eq!(extract_mi_base("123"), "123");
297/// assert_eq!(extract_mi_base("123/A"), "123");
298/// assert_eq!(extract_mi_base("123/B"), "123");
299/// assert_eq!(extract_mi_base("abc/456/A"), "abc/456");
300/// assert_eq!(extract_mi_base("123/C"), "123/C");  // not stripped
301/// ```
302#[must_use]
303pub fn extract_mi_base(mi: &str) -> &str {
304    if let Some(stripped) = mi.strip_suffix("/A") {
305        stripped
306    } else if let Some(stripped) = mi.strip_suffix("/B") {
307        stripped
308    } else {
309        mi
310    }
311}
312
313#[cfg(test)]
314mod tests {
315    use super::*;
316
317    #[test]
318    fn test_consensus_reverse_returns_correct_tags() {
319        assert_eq!(TagSets::CONSENSUS_REVERSE.len(), 5);
320        assert_eq!(TagSets::CONSENSUS_REVERSE, &["ad", "ae", "bd", "be", "cd"]);
321    }
322
323    #[test]
324    fn test_consensus_revcomp_returns_correct_tags() {
325        assert_eq!(TagSets::CONSENSUS_REVCOMP.len(), 3);
326        assert_eq!(TagSets::CONSENSUS_REVCOMP, &["aD", "bD", "cD"]);
327    }
328
329    #[test]
330    fn test_taginfo_new_with_empty_lists() {
331        let tag_info = TagInfo::new(vec![], vec![], vec![]);
332        assert!(tag_info.remove.is_empty());
333        assert!(tag_info.reverse.is_empty());
334        assert!(tag_info.revcomp.is_empty());
335    }
336
337    #[test]
338    fn test_taginfo_new_with_individual_tags() {
339        let tag_info = TagInfo::new(
340            vec!["AS".to_string(), "NM".to_string()],
341            vec!["BQ".to_string()],
342            vec!["E2".to_string(), "U2".to_string()],
343        );
344        assert_eq!(tag_info.remove.len(), 2);
345        assert!(tag_info.remove.contains("AS"));
346        assert!(tag_info.remove.contains("NM"));
347        assert_eq!(tag_info.reverse.len(), 1);
348        assert!(tag_info.reverse.contains("BQ"));
349        assert_eq!(tag_info.revcomp.len(), 2);
350        assert!(tag_info.revcomp.contains("E2"));
351        assert!(tag_info.revcomp.contains("U2"));
352    }
353
354    #[test]
355    fn test_taginfo_new_with_consensus_reverse() {
356        let tag_info = TagInfo::new(vec![], vec!["Consensus".to_string()], vec![]);
357        assert_eq!(tag_info.reverse.len(), 5);
358        assert!(tag_info.reverse.contains("ad"));
359        assert!(tag_info.reverse.contains("ae"));
360        assert!(tag_info.reverse.contains("bd"));
361        assert!(tag_info.reverse.contains("be"));
362        assert!(tag_info.reverse.contains("cd"));
363    }
364
365    #[test]
366    fn test_taginfo_new_with_consensus_revcomp() {
367        let tag_info = TagInfo::new(vec![], vec![], vec!["Consensus".to_string()]);
368        assert_eq!(tag_info.revcomp.len(), 3);
369        assert!(tag_info.revcomp.contains("aD"));
370        assert!(tag_info.revcomp.contains("bD"));
371        assert!(tag_info.revcomp.contains("cD"));
372    }
373
374    #[test]
375    fn test_taginfo_new_with_consensus_and_individual_tags() {
376        let tag_info = TagInfo::new(
377            vec!["AS".to_string()],
378            vec!["Consensus".to_string(), "BQ".to_string()],
379            vec!["Consensus".to_string(), "E2".to_string()],
380        );
381        // Remove set
382        assert_eq!(tag_info.remove.len(), 1);
383        assert!(tag_info.remove.contains("AS"));
384        // Reverse set should have Consensus tags + BQ
385        assert_eq!(tag_info.reverse.len(), 6);
386        assert!(tag_info.reverse.contains("ad"));
387        assert!(tag_info.reverse.contains("ae"));
388        assert!(tag_info.reverse.contains("bd"));
389        assert!(tag_info.reverse.contains("be"));
390        assert!(tag_info.reverse.contains("cd"));
391        assert!(tag_info.reverse.contains("BQ"));
392        // Revcomp set should have Consensus tags + E2
393        assert_eq!(tag_info.revcomp.len(), 4);
394        assert!(tag_info.revcomp.contains("aD"));
395        assert!(tag_info.revcomp.contains("bD"));
396        assert!(tag_info.revcomp.contains("cD"));
397        assert!(tag_info.revcomp.contains("E2"));
398    }
399
400    #[test]
401    fn test_taginfo_new_with_duplicate_tags() {
402        let tag_info = TagInfo::new(
403            vec!["AS".to_string(), "AS".to_string()],
404            vec!["BQ".to_string(), "BQ".to_string()],
405            vec!["E2".to_string(), "E2".to_string()],
406        );
407        // HashSet should deduplicate
408        assert_eq!(tag_info.remove.len(), 1);
409        assert_eq!(tag_info.reverse.len(), 1);
410        assert_eq!(tag_info.revcomp.len(), 1);
411    }
412
413    #[test]
414    fn test_taginfo_new_with_multiple_consensus_references() {
415        let tag_info = TagInfo::new(
416            vec![],
417            vec!["Consensus".to_string(), "Consensus".to_string()],
418            vec!["Consensus".to_string(), "Consensus".to_string()],
419        );
420        // Should not duplicate consensus tags
421        assert_eq!(tag_info.reverse.len(), 5);
422        assert_eq!(tag_info.revcomp.len(), 3);
423    }
424
425    #[test]
426    fn test_has_revs_or_revcomps_with_both_empty() {
427        let tag_info = TagInfo::new(vec!["AS".to_string()], vec![], vec![]);
428        assert!(!tag_info.has_revs_or_revcomps());
429    }
430
431    #[test]
432    fn test_has_revs_or_revcomps_with_reverse_only() {
433        let tag_info = TagInfo::new(vec![], vec!["BQ".to_string()], vec![]);
434        assert!(tag_info.has_revs_or_revcomps());
435    }
436
437    #[test]
438    fn test_has_revs_or_revcomps_with_revcomp_only() {
439        let tag_info = TagInfo::new(vec![], vec![], vec!["E2".to_string()]);
440        assert!(tag_info.has_revs_or_revcomps());
441    }
442
443    #[test]
444    fn test_has_revs_or_revcomps_with_both() {
445        let tag_info = TagInfo::new(vec![], vec!["BQ".to_string()], vec!["E2".to_string()]);
446        assert!(tag_info.has_revs_or_revcomps());
447    }
448
449    #[test]
450    fn test_has_revs_or_revcomps_with_consensus() {
451        let tag_info = TagInfo::new(vec![], vec!["Consensus".to_string()], vec![]);
452        assert!(tag_info.has_revs_or_revcomps());
453    }
454
455    #[test]
456    fn test_taginfo_clone() {
457        let tag_info =
458            TagInfo::new(vec!["AS".to_string()], vec!["BQ".to_string()], vec!["E2".to_string()]);
459        let cloned = tag_info.clone();
460        assert_eq!(cloned.remove.len(), tag_info.remove.len());
461        assert_eq!(cloned.reverse.len(), tag_info.reverse.len());
462        assert_eq!(cloned.revcomp.len(), tag_info.revcomp.len());
463    }
464
465    #[test]
466    fn test_extract_mi_base_simple() {
467        assert_eq!(extract_mi_base("123"), "123");
468        assert_eq!(extract_mi_base("A"), "A");
469    }
470
471    #[test]
472    fn test_extract_mi_base_with_duplex_suffix() {
473        // Only /A and /B suffixes are stripped
474        assert_eq!(extract_mi_base("123/A"), "123");
475        assert_eq!(extract_mi_base("123/B"), "123");
476        assert_eq!(extract_mi_base("123/456/A"), "123/456");
477        // Other suffixes are preserved
478        assert_eq!(extract_mi_base("123/C"), "123/C");
479        assert_eq!(extract_mi_base("123/ReallyLongSuffix"), "123/ReallyLongSuffix");
480    }
481
482    // =========================================================================
483    // validate_umi tests - matching fgbio behavior
484    // =========================================================================
485
486    #[test]
487    fn test_validate_umi_uppercase_acgt() {
488        assert_eq!(validate_umi(b"ACGT"), UmiValidation::Valid(4));
489        assert_eq!(validate_umi(b"AAAAAAAA"), UmiValidation::Valid(8));
490        assert_eq!(validate_umi(b"TTTTTTTT"), UmiValidation::Valid(8));
491    }
492
493    #[test]
494    fn test_validate_umi_lowercase_acgt() {
495        // Lowercase bases should be counted (matches fgbio's toUpperCase behavior)
496        assert_eq!(validate_umi(b"acgt"), UmiValidation::Valid(4));
497        assert_eq!(validate_umi(b"AcGt"), UmiValidation::Valid(4));
498    }
499
500    #[test]
501    fn test_validate_umi_uppercase_n_rejected() {
502        // Uppercase N should be rejected (matches fgbio's .contains('N'))
503        assert_eq!(validate_umi(b"ACNT"), UmiValidation::ContainsN);
504        assert_eq!(validate_umi(b"NACGT"), UmiValidation::ContainsN);
505        assert_eq!(validate_umi(b"ACGTN"), UmiValidation::ContainsN);
506        assert_eq!(validate_umi(b"NNNN"), UmiValidation::ContainsN);
507    }
508
509    #[test]
510    fn test_validate_umi_lowercase_n_skipped() {
511        // Lowercase 'n' should be skipped, not rejected (matches fgbio)
512        // fgbio's .contains('N') is case-sensitive
513        assert_eq!(validate_umi(b"ACnT"), UmiValidation::Valid(3));
514        assert_eq!(validate_umi(b"acnt"), UmiValidation::Valid(3));
515        assert_eq!(validate_umi(b"nnnn"), UmiValidation::Valid(0));
516    }
517
518    #[test]
519    fn test_validate_umi_dash_skipped() {
520        // Dash is skipped for paired UMIs
521        assert_eq!(validate_umi(b"ACGT-TGCA"), UmiValidation::Valid(8));
522        assert_eq!(validate_umi(b"----"), UmiValidation::Valid(0));
523    }
524
525    #[test]
526    fn test_validate_umi_other_chars_skipped() {
527        // Other characters are silently skipped
528        assert_eq!(validate_umi(b"ACGT+TGCA"), UmiValidation::Valid(8));
529        assert_eq!(validate_umi(b"AC GT"), UmiValidation::Valid(4));
530    }
531
532    #[test]
533    fn test_validate_umi_empty() {
534        assert_eq!(validate_umi(b""), UmiValidation::Valid(0));
535    }
536
537    #[test]
538    fn test_validate_umi_mixed_case_with_uppercase_n() {
539        // Mixed case with uppercase N should still be rejected
540        assert_eq!(validate_umi(b"acNt"), UmiValidation::ContainsN);
541        assert_eq!(validate_umi(b"AcNt"), UmiValidation::ContainsN);
542    }
543}