1#![deny(unsafe_code)]
2
3pub mod assigner;
11
12pub use assigner::{
14 AdjacencyUmiAssigner, IdentityUmiAssigner, PairedUmiAssigner, SimpleErrorUmiAssigner, Strategy,
15 Umi, UmiAssigner,
16};
17
18use std::collections::HashSet;
19
/// A molecule identifier that is either unassigned, a single ID, or one of
/// the two paired forms (`A` / `B`).
///
/// Every assigned variant carries a numeric `u64` ID. The paired variants are
/// rendered with a `/A` or `/B` suffix.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum MoleculeId {
    /// No ID has been assigned; this is the default value.
    #[default]
    None,
    /// An unpaired ID, rendered as the bare number.
    Single(u64),
    /// A paired ID rendered with the `/A` suffix.
    PairedA(u64),
    /// A paired ID rendered with the `/B` suffix.
    PairedB(u64),
}

impl MoleculeId {
    /// Returns the numeric ID, or `None` when the value is [`MoleculeId::None`].
    #[inline]
    #[must_use]
    pub fn id(&self) -> Option<u64> {
        match self {
            Self::None => None,
            Self::Single(id) | Self::PairedA(id) | Self::PairedB(id) => Some(*id),
        }
    }

    /// Returns `true` for every variant except [`MoleculeId::None`].
    #[inline]
    #[must_use]
    pub fn is_assigned(&self) -> bool {
        !matches!(self, Self::None)
    }

    /// Renders the ID with `base` added to the stored number.
    ///
    /// Allocates a fresh `String`; see [`MoleculeId::write_with_offset`] for
    /// the buffer-reusing variant and the exact output format.
    #[must_use]
    pub fn to_string_with_offset(&self, base: u64) -> String {
        let mut buf = String::new();
        self.write_with_offset(base, &mut buf);
        buf
    }

    /// Clears `buf`, writes the rendered ID into it and returns the written bytes.
    ///
    /// The output is `base + id` in decimal, followed by `/A` or `/B` for the
    /// paired variants. [`MoleculeId::None`] leaves the buffer empty.
    ///
    /// # Panics
    ///
    /// `base + id` is plain `u64` addition, so it panics on overflow when
    /// overflow checks are enabled.
    pub fn write_with_offset<'a>(&self, base: u64, buf: &'a mut String) -> &'a [u8] {
        use std::fmt::Write;
        // Any previous contents are discarded so the buffer can be reused across calls.
        buf.clear();
        match self {
            Self::None => {}
            Self::Single(id) => write!(buf, "{}", base + id).unwrap(),
            Self::PairedA(id) => write!(buf, "{}/A", base + id).unwrap(),
            Self::PairedB(id) => write!(buf, "{}/B", base + id).unwrap(),
        }
        buf.as_bytes()
    }

    /// Maps the ID to a dense index, or `None` when unassigned.
    ///
    /// `Single(id)` maps to `id`, `PairedA(id)` to `2 * id` and `PairedB(id)`
    /// to `2 * id + 1`. The single and paired index spaces overlap (for
    /// example `Single(2)` and `PairedA(1)` both map to `2`), so the result is
    /// only collision-free when single and paired IDs are not mixed.
    #[inline]
    #[must_use]
    #[expect(clippy::cast_possible_truncation, reason = "molecule IDs never exceed usize::MAX / 2")]
    pub fn to_vec_index(&self) -> Option<usize> {
        match self {
            Self::None => None,
            Self::Single(id) => Some(*id as usize),
            Self::PairedA(id) => Some(*id as usize * 2),
            Self::PairedB(id) => Some(*id as usize * 2 + 1),
        }
    }

    /// Returns `true` for [`MoleculeId::PairedA`] and [`MoleculeId::PairedB`].
    #[inline]
    #[must_use]
    pub fn is_paired(&self) -> bool {
        matches!(self, Self::PairedA(_) | Self::PairedB(_))
    }

    /// Returns the numeric ID as a decimal string without any `/A` or `/B`
    /// suffix; the string is empty for [`MoleculeId::None`].
    #[inline]
    #[must_use]
    pub fn base_id_string(&self) -> String {
        match self {
            Self::None => String::new(),
            Self::Single(id) | Self::PairedA(id) | Self::PairedB(id) => id.to_string(),
        }
    }
}

/// Formats the ID as the bare number, the number followed by `/A` or `/B`,
/// or the empty string for [`MoleculeId::None`].
impl std::fmt::Display for MoleculeId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::None => write!(f, ""),
            Self::Single(id) => write!(f, "{id}"),
            Self::PairedA(id) => write!(f, "{id}/A"),
            Self::PairedB(id) => write!(f, "{id}/B"),
        }
    }
}
136
/// Namespace for predefined lists of tag names.
pub struct TagSets;

impl TagSets {
    /// Tag names that `TagInfo::new` substitutes for the `"Consensus"`
    /// keyword when it appears in the `reverse` list.
    // `'static` is spelled out so the const does not depend on how a given
    // toolchain treats elided lifetimes in associated constants.
    pub const CONSENSUS_REVERSE: &'static [&'static str] = &["ad", "ae", "bd", "be", "cd"];

    /// Tag names that `TagInfo::new` substitutes for the `"Consensus"`
    /// keyword when it appears in the `revcomp` list.
    pub const CONSENSUS_REVCOMP: &'static [&'static str] = &["aD", "bD", "cD"];
}
156
157#[derive(Debug, Clone)]
162pub struct TagInfo {
163 pub remove: HashSet<String>,
165 pub reverse: HashSet<String>,
167 pub revcomp: HashSet<String>,
169}
170
171impl TagInfo {
172 #[must_use]
194 pub fn new(remove: Vec<String>, reverse: Vec<String>, revcomp: Vec<String>) -> Self {
195 let mut reverse_set = HashSet::new();
196 let mut revcomp_set = HashSet::new();
197
198 for tag in reverse {
199 if tag == "Consensus" {
200 reverse_set.extend(TagSets::CONSENSUS_REVERSE.iter().map(|&s| s.to_owned()));
201 } else {
202 reverse_set.insert(tag);
203 }
204 }
205
206 for tag in revcomp {
207 if tag == "Consensus" {
208 revcomp_set.extend(TagSets::CONSENSUS_REVCOMP.iter().map(|&s| s.to_owned()));
209 } else {
210 revcomp_set.insert(tag);
211 }
212 }
213
214 TagInfo { remove: remove.into_iter().collect(), reverse: reverse_set, revcomp: revcomp_set }
215 }
216
217 #[must_use]
223 pub fn has_revs_or_revcomps(&self) -> bool {
224 !self.reverse.is_empty() || !self.revcomp.is_empty()
225 }
226}
227
/// Outcome of [`validate_umi`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UmiValidation {
    /// No uppercase `N` was found; the payload is the number of
    /// `A`/`C`/`G`/`T` bases counted (either case).
    Valid(usize),
    /// The input contains an uppercase `N`.
    ContainsN,
}

/// Scans `umi` byte by byte and classifies it.
///
/// * `A`, `C`, `G`, `T` in either case are counted.
/// * An uppercase `N` stops the scan and yields [`UmiValidation::ContainsN`].
/// * Every other byte is skipped without being counted. This includes
///   lowercase `n`, `-`, `+` and spaces.
///
/// An empty input yields `Valid(0)`.
#[must_use]
pub fn validate_umi(umi: &[u8]) -> UmiValidation {
    let mut base_count = 0usize;
    for &b in umi {
        match b {
            b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't' => base_count += 1,
            b'N' => return UmiValidation::ContainsN,
            // Anything else (separators, lowercase `n`, ...) is ignored on purpose.
            _ => {}
        }
    }
    UmiValidation::Valid(base_count)
}
282
/// Returns `mi` with one trailing `/A` or `/B` removed.
///
/// Only those two exact suffixes are stripped, and only once; any other
/// input (for example `"123/C"` or `"123"`) is returned unchanged.
#[must_use]
pub fn extract_mi_base(mi: &str) -> &str {
    mi.strip_suffix("/A").or_else(|| mi.strip_suffix("/B")).unwrap_or(mi)
}
312
/// Unit tests for the tag-set constants, `TagInfo`, `extract_mi_base` and `validate_umi`.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- TagSets ----

    #[test]
    fn test_consensus_reverse_returns_correct_tags() {
        assert_eq!(TagSets::CONSENSUS_REVERSE.len(), 5);
        assert_eq!(TagSets::CONSENSUS_REVERSE, &["ad", "ae", "bd", "be", "cd"]);
    }

    #[test]
    fn test_consensus_revcomp_returns_correct_tags() {
        assert_eq!(TagSets::CONSENSUS_REVCOMP.len(), 3);
        assert_eq!(TagSets::CONSENSUS_REVCOMP, &["aD", "bD", "cD"]);
    }

    // ---- TagInfo::new ----

    #[test]
    fn test_taginfo_new_with_empty_lists() {
        let tag_info = TagInfo::new(vec![], vec![], vec![]);
        assert!(tag_info.remove.is_empty());
        assert!(tag_info.reverse.is_empty());
        assert!(tag_info.revcomp.is_empty());
    }

    #[test]
    fn test_taginfo_new_with_individual_tags() {
        let tag_info = TagInfo::new(
            vec!["AS".to_string(), "NM".to_string()],
            vec!["BQ".to_string()],
            vec!["E2".to_string(), "U2".to_string()],
        );
        assert_eq!(tag_info.remove.len(), 2);
        assert!(tag_info.remove.contains("AS"));
        assert!(tag_info.remove.contains("NM"));
        assert_eq!(tag_info.reverse.len(), 1);
        assert!(tag_info.reverse.contains("BQ"));
        assert_eq!(tag_info.revcomp.len(), 2);
        assert!(tag_info.revcomp.contains("E2"));
        assert!(tag_info.revcomp.contains("U2"));
    }

    #[test]
    fn test_taginfo_new_with_consensus_reverse() {
        let tag_info = TagInfo::new(vec![], vec!["Consensus".to_string()], vec![]);
        assert_eq!(tag_info.reverse.len(), 5);
        assert!(tag_info.reverse.contains("ad"));
        assert!(tag_info.reverse.contains("ae"));
        assert!(tag_info.reverse.contains("bd"));
        assert!(tag_info.reverse.contains("be"));
        assert!(tag_info.reverse.contains("cd"));
    }

    #[test]
    fn test_taginfo_new_with_consensus_revcomp() {
        let tag_info = TagInfo::new(vec![], vec![], vec!["Consensus".to_string()]);
        assert_eq!(tag_info.revcomp.len(), 3);
        assert!(tag_info.revcomp.contains("aD"));
        assert!(tag_info.revcomp.contains("bD"));
        assert!(tag_info.revcomp.contains("cD"));
    }

    #[test]
    fn test_taginfo_new_with_consensus_and_individual_tags() {
        let tag_info = TagInfo::new(
            vec!["AS".to_string()],
            vec!["Consensus".to_string(), "BQ".to_string()],
            vec!["Consensus".to_string(), "E2".to_string()],
        );
        assert_eq!(tag_info.remove.len(), 1);
        assert!(tag_info.remove.contains("AS"));
        // Five consensus tags plus "BQ".
        assert_eq!(tag_info.reverse.len(), 6);
        assert!(tag_info.reverse.contains("ad"));
        assert!(tag_info.reverse.contains("ae"));
        assert!(tag_info.reverse.contains("bd"));
        assert!(tag_info.reverse.contains("be"));
        assert!(tag_info.reverse.contains("cd"));
        assert!(tag_info.reverse.contains("BQ"));
        // Three consensus tags plus "E2".
        assert_eq!(tag_info.revcomp.len(), 4);
        assert!(tag_info.revcomp.contains("aD"));
        assert!(tag_info.revcomp.contains("bD"));
        assert!(tag_info.revcomp.contains("cD"));
        assert!(tag_info.revcomp.contains("E2"));
    }

    #[test]
    fn test_taginfo_new_with_duplicate_tags() {
        let tag_info = TagInfo::new(
            vec!["AS".to_string(), "AS".to_string()],
            vec!["BQ".to_string(), "BQ".to_string()],
            vec!["E2".to_string(), "E2".to_string()],
        );
        // The fields are sets, so repeated names collapse to one entry.
        assert_eq!(tag_info.remove.len(), 1);
        assert_eq!(tag_info.reverse.len(), 1);
        assert_eq!(tag_info.revcomp.len(), 1);
    }

    #[test]
    fn test_taginfo_new_with_multiple_consensus_references() {
        let tag_info = TagInfo::new(
            vec![],
            vec!["Consensus".to_string(), "Consensus".to_string()],
            vec!["Consensus".to_string(), "Consensus".to_string()],
        );
        // Expanding "Consensus" twice must not duplicate the expanded tags.
        assert_eq!(tag_info.reverse.len(), 5);
        assert_eq!(tag_info.revcomp.len(), 3);
    }

    // ---- TagInfo::has_revs_or_revcomps ----

    #[test]
    fn test_has_revs_or_revcomps_with_both_empty() {
        let tag_info = TagInfo::new(vec!["AS".to_string()], vec![], vec![]);
        assert!(!tag_info.has_revs_or_revcomps());
    }

    #[test]
    fn test_has_revs_or_revcomps_with_reverse_only() {
        let tag_info = TagInfo::new(vec![], vec!["BQ".to_string()], vec![]);
        assert!(tag_info.has_revs_or_revcomps());
    }

    #[test]
    fn test_has_revs_or_revcomps_with_revcomp_only() {
        let tag_info = TagInfo::new(vec![], vec![], vec!["E2".to_string()]);
        assert!(tag_info.has_revs_or_revcomps());
    }

    #[test]
    fn test_has_revs_or_revcomps_with_both() {
        let tag_info = TagInfo::new(vec![], vec!["BQ".to_string()], vec!["E2".to_string()]);
        assert!(tag_info.has_revs_or_revcomps());
    }

    #[test]
    fn test_has_revs_or_revcomps_with_consensus() {
        let tag_info = TagInfo::new(vec![], vec!["Consensus".to_string()], vec![]);
        assert!(tag_info.has_revs_or_revcomps());
    }

    #[test]
    fn test_taginfo_clone() {
        let tag_info =
            TagInfo::new(vec!["AS".to_string()], vec!["BQ".to_string()], vec!["E2".to_string()]);
        let cloned = tag_info.clone();
        assert_eq!(cloned.remove.len(), tag_info.remove.len());
        assert_eq!(cloned.reverse.len(), tag_info.reverse.len());
        assert_eq!(cloned.revcomp.len(), tag_info.revcomp.len());
    }

    // ---- extract_mi_base ----

    #[test]
    fn test_extract_mi_base_simple() {
        assert_eq!(extract_mi_base("123"), "123");
        assert_eq!(extract_mi_base("A"), "A");
    }

    #[test]
    fn test_extract_mi_base_with_duplex_suffix() {
        assert_eq!(extract_mi_base("123/A"), "123");
        assert_eq!(extract_mi_base("123/B"), "123");
        // Only the trailing suffix is stripped; earlier slashes are kept.
        assert_eq!(extract_mi_base("123/456/A"), "123/456");
        // Suffixes other than /A and /B are left untouched.
        assert_eq!(extract_mi_base("123/C"), "123/C");
        assert_eq!(extract_mi_base("123/ReallyLongSuffix"), "123/ReallyLongSuffix");
    }

    // ---- validate_umi ----

    #[test]
    fn test_validate_umi_uppercase_acgt() {
        assert_eq!(validate_umi(b"ACGT"), UmiValidation::Valid(4));
        assert_eq!(validate_umi(b"AAAAAAAA"), UmiValidation::Valid(8));
        assert_eq!(validate_umi(b"TTTTTTTT"), UmiValidation::Valid(8));
    }

    #[test]
    fn test_validate_umi_lowercase_acgt() {
        assert_eq!(validate_umi(b"acgt"), UmiValidation::Valid(4));
        assert_eq!(validate_umi(b"AcGt"), UmiValidation::Valid(4));
    }

    #[test]
    fn test_validate_umi_uppercase_n_rejected() {
        assert_eq!(validate_umi(b"ACNT"), UmiValidation::ContainsN);
        assert_eq!(validate_umi(b"NACGT"), UmiValidation::ContainsN);
        assert_eq!(validate_umi(b"ACGTN"), UmiValidation::ContainsN);
        assert_eq!(validate_umi(b"NNNN"), UmiValidation::ContainsN);
    }

    #[test]
    fn test_validate_umi_lowercase_n_skipped() {
        // Lowercase `n` is neither counted nor rejected.
        assert_eq!(validate_umi(b"ACnT"), UmiValidation::Valid(3));
        assert_eq!(validate_umi(b"acnt"), UmiValidation::Valid(3));
        assert_eq!(validate_umi(b"nnnn"), UmiValidation::Valid(0));
    }

    #[test]
    fn test_validate_umi_dash_skipped() {
        assert_eq!(validate_umi(b"ACGT-TGCA"), UmiValidation::Valid(8));
        assert_eq!(validate_umi(b"----"), UmiValidation::Valid(0));
    }

    #[test]
    fn test_validate_umi_other_chars_skipped() {
        assert_eq!(validate_umi(b"ACGT+TGCA"), UmiValidation::Valid(8));
        assert_eq!(validate_umi(b"AC GT"), UmiValidation::Valid(4));
    }

    #[test]
    fn test_validate_umi_empty() {
        assert_eq!(validate_umi(b""), UmiValidation::Valid(0));
    }

    #[test]
    fn test_validate_umi_mixed_case_with_uppercase_n() {
        assert_eq!(validate_umi(b"acNt"), UmiValidation::ContainsN);
        assert_eq!(validate_umi(b"AcNt"), UmiValidation::ContainsN);
    }
}