/// Which sequence of a record the UMI bases are taken from.
///
/// [`UmiLocation::Read1`] is the default variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum UmiLocation {
    /// The UMI sits in read 1.
    #[default]
    Read1,
    /// The UMI sits in read 2.
    Read2,
    /// The UMI sits in the index sequence.
    Index,
    /// Per-index variant; spelled `per_index` (or `perindex`) in text form.
    PerIndex,
}

impl UmiLocation {
    /// Parses a textual location name, ignoring letter case.
    ///
    /// Accepted spellings are `read1`, `read2`, `index`, `per_index` and
    /// `perindex`. Anything else (including surrounding whitespace) yields
    /// `None`.
    pub fn from_str(s: &str) -> Option<Self> {
        // Every accepted lowercase spelling paired with the variant it selects.
        const SPELLINGS: [(&str, UmiLocation); 5] = [
            ("read1", UmiLocation::Read1),
            ("read2", UmiLocation::Read2),
            ("index", UmiLocation::Index),
            ("per_index", UmiLocation::PerIndex),
            ("perindex", UmiLocation::PerIndex),
        ];

        let wanted = s.to_lowercase();
        SPELLINGS
            .iter()
            .find(|(spelling, _)| *spelling == wanted)
            .map(|&(_, location)| location)
    }

    /// Returns the canonical lowercase name of this location.
    ///
    /// The returned string is always accepted by [`UmiLocation::from_str`].
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Read1 => "read1",
            Self::Read2 => "read2",
            Self::Index => "index",
            Self::PerIndex => "per_index",
        }
    }
}
/// Settings that decide whether a UMI is extracted and how it is written
/// into the read name.
#[derive(Debug, Clone)]
pub struct UmiConfig {
    /// Requested on/off state; [`UmiConfig::is_enabled`] additionally
    /// requires a non-zero `length`.
    pub enabled: bool,
    /// Which sequence the UMI is read from.
    pub location: UmiLocation,
    /// Number of UMI bases to take.
    pub length: usize,
    /// Text placed directly in front of the UMI when it is appended to a name.
    pub prefix: String,
    /// Number of leading bases ignored before the UMI starts.
    pub skip: usize,
    /// Text placed between the original read name and the prefix/UMI.
    pub separator: String,
}

impl Default for UmiConfig {
    /// A disabled configuration: zero length, no skip, empty prefix and
    /// `":"` as separator.
    fn default() -> Self {
        Self {
            enabled: false,
            location: UmiLocation::Read1,
            length: 0,
            prefix: String::new(),
            skip: 0,
            separator: ":".to_string(),
        }
    }
}

impl UmiConfig {
    /// Creates the default (disabled) configuration as a builder starting point.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a configuration with UMI handling switched off.
    ///
    /// Identical to [`UmiConfig::default`]; the name documents intent at the
    /// call site.
    pub fn disabled() -> Self {
        Self::default()
    }

    /// Sets the `enabled` flag.
    ///
    /// The configuration only becomes effective once `length` is non-zero,
    /// see [`UmiConfig::is_enabled`].
    pub fn enabled(mut self) -> Self {
        self.enabled = true;
        self
    }

    /// Sets the sequence the UMI is read from.
    pub fn with_location(mut self, location: UmiLocation) -> Self {
        self.location = location;
        self
    }

    /// Sets the UMI length and derives the `enabled` flag from it.
    ///
    /// A non-zero length enables the configuration and a zero length
    /// disables it, overwriting any earlier [`UmiConfig::enabled`] call.
    pub fn with_length(mut self, length: usize) -> Self {
        self.length = length;
        self.enabled = length > 0;
        self
    }

    /// Sets the text written directly in front of the UMI.
    pub fn with_prefix(mut self, prefix: impl Into<String>) -> Self {
        self.prefix = prefix.into();
        self
    }

    /// Sets how many leading bases are ignored before the UMI.
    pub fn with_skip(mut self, skip: usize) -> Self {
        self.skip = skip;
        self
    }

    /// Sets the text written between the read name and the prefix/UMI.
    pub fn with_separator(mut self, separator: impl Into<String>) -> Self {
        self.separator = separator.into();
        self
    }

    /// Returns `true` only when the flag is set *and* there is at least one
    /// UMI base to extract.
    #[inline]
    pub fn is_enabled(&self) -> bool {
        self.enabled && self.length > 0
    }

    /// Number of leading bases occupied by the skipped region plus the UMI.
    ///
    /// The sum saturates at `usize::MAX` instead of overflowing, so an
    /// absurdly large `skip`/`length` pair reads as "longer than any
    /// sequence" rather than panicking or wrapping to a small value.
    #[inline]
    pub fn total_consumed(&self) -> usize {
        self.skip.saturating_add(self.length)
    }
}
/// Applies a [`UmiConfig`] to reads: extracts the UMI, appends it to the
/// read name and trims the consumed bases where the UMI lives in the read.
#[derive(Debug, Clone)]
pub struct UmiProcessor {
    /// Settings consulted by every method; fixed at construction.
    config: UmiConfig,
}

impl UmiProcessor {
    /// Creates a processor that follows `config`.
    pub fn new(config: UmiConfig) -> Self {
        Self { config }
    }

    /// Returns the configuration this processor was built with.
    #[inline]
    pub fn config(&self) -> &UmiConfig {
        &self.config
    }

    /// Returns whether UMI handling is effectively enabled
    /// (delegates to [`UmiConfig::is_enabled`]).
    #[inline]
    pub fn is_enabled(&self) -> bool {
        self.config.is_enabled()
    }

    /// Extracts the UMI bases for one record.
    ///
    /// For `Read1`/`Read2` the UMI is taken from `seq`; for
    /// `Index`/`PerIndex` it is taken from `index_seq`.
    ///
    /// Returns `None` when the processor is disabled, when an index-based
    /// location is configured but `index_seq` is `None`, or when the source
    /// sequence is shorter than `skip + length`.
    pub fn extract_umi(&self, seq: &[u8], index_seq: Option<&[u8]>) -> Option<Vec<u8>> {
        if !self.config.is_enabled() {
            return None;
        }
        let source = match self.config.location {
            UmiLocation::Read1 | UmiLocation::Read2 => seq,
            UmiLocation::Index | UmiLocation::PerIndex => index_seq?,
        };
        self.extract_from_sequence(source)
    }

    /// Borrows the bases `skip..skip + length` from `seq`.
    ///
    /// Returns `None` when that range does not fit into `seq`, including the
    /// case where `skip + length` would overflow `usize`.
    fn umi_slice<'a>(&self, seq: &'a [u8]) -> Option<&'a [u8]> {
        let start = self.config.skip;
        // `checked_add` keeps a pathological configuration from wrapping
        // around to a small (and wrong) end offset.
        let end = start.checked_add(self.config.length)?;
        seq.get(start..end)
    }

    /// Owned variant of [`Self::umi_slice`].
    #[inline]
    fn extract_from_sequence(&self, seq: &[u8]) -> Option<Vec<u8>> {
        self.umi_slice(seq).map(|umi| umi.to_vec())
    }

    /// Splits a read that carries its UMI inline.
    ///
    /// Returns the borrowed UMI together with owned copies of the sequence
    /// and quality that remain after the first `skip + length` positions.
    /// Returns `None` when either `seq` or `qual` is too short.
    fn split_inline_umi<'a>(
        &self,
        seq: &'a [u8],
        qual: &[u8],
    ) -> Option<(&'a [u8], Vec<u8>, Vec<u8>)> {
        let umi = self.umi_slice(seq)?;
        // `umi_slice` succeeded, so this sum neither overflows nor exceeds
        // `seq.len()`.
        let consumed = self.config.skip + self.config.length;
        let rest_seq = seq.get(consumed..)?;
        let rest_qual = qual.get(consumed..)?;
        Some((umi, rest_seq.to_vec(), rest_qual.to_vec()))
    }

    /// Builds `name + separator + prefix + umi` as a new byte vector.
    pub fn add_umi_to_name(&self, name: &[u8], umi: &[u8]) -> Vec<u8> {
        let separator = self.config.separator.as_bytes();
        let prefix = self.config.prefix.as_bytes();
        let mut new_name =
            Vec::with_capacity(name.len() + separator.len() + prefix.len() + umi.len());
        new_name.extend_from_slice(name);
        new_name.extend_from_slice(separator);
        new_name.extend_from_slice(prefix);
        new_name.extend_from_slice(umi);
        new_name
    }

    /// Processes a single read and returns `(name, seq, qual)`.
    ///
    /// * Disabled processor: the inputs are returned unchanged (copied).
    /// * `Read1`/`Read2`: the UMI is taken from `seq`, appended to the name,
    ///   and the first `skip + length` positions are removed from both `seq`
    ///   and `qual`.
    /// * `Index`/`PerIndex`: the UMI is taken from `index_seq` and appended
    ///   to the name; `seq` and `qual` are returned untrimmed.
    ///
    /// Returns `None` when the UMI cannot be extracted (missing index,
    /// source too short) or when `qual` is shorter than the trimmed prefix.
    pub fn process_read(
        &self,
        name: &[u8],
        seq: &[u8],
        qual: &[u8],
        index_seq: Option<&[u8]>,
    ) -> Option<(Vec<u8>, Vec<u8>, Vec<u8>)> {
        if !self.config.is_enabled() {
            return Some((name.to_vec(), seq.to_vec(), qual.to_vec()));
        }
        match self.config.location {
            UmiLocation::Read1 | UmiLocation::Read2 => {
                let (umi, rest_seq, rest_qual) = self.split_inline_umi(seq, qual)?;
                Some((self.add_umi_to_name(name, umi), rest_seq, rest_qual))
            }
            UmiLocation::Index | UmiLocation::PerIndex => {
                let umi = self.umi_slice(index_seq?)?;
                Some((self.add_umi_to_name(name, umi), seq.to_vec(), qual.to_vec()))
            }
        }
    }

    /// Processes a read pair and returns `((r1 record), (r2 record))`, each
    /// record being `(name, seq, qual)`.
    ///
    /// The UMI is extracted once — from read 1, read 2 or the index
    /// sequence depending on the configured location — and appended to
    /// *both* read names. Only the read that carried the UMI is trimmed by
    /// `skip + length`; index-based locations trim nothing.
    ///
    /// A disabled processor returns copies of the inputs. `None` is returned
    /// when the UMI source (or the matching quality string) is too short, or
    /// when an index-based location is configured without `index_seq`.
    // The nested-tuple return type is part of the public signature.
    #[allow(clippy::type_complexity)]
    pub fn process_paired_reads(
        &self,
        r1_name: &[u8],
        r1_seq: &[u8],
        r1_qual: &[u8],
        r2_name: &[u8],
        r2_seq: &[u8],
        r2_qual: &[u8],
        index_seq: Option<&[u8]>,
    ) -> Option<((Vec<u8>, Vec<u8>, Vec<u8>), (Vec<u8>, Vec<u8>, Vec<u8>))> {
        if !self.config.is_enabled() {
            return Some((
                (r1_name.to_vec(), r1_seq.to_vec(), r1_qual.to_vec()),
                (r2_name.to_vec(), r2_seq.to_vec(), r2_qual.to_vec()),
            ));
        }
        match self.config.location {
            UmiLocation::Read1 => {
                let (umi, rest_seq, rest_qual) = self.split_inline_umi(r1_seq, r1_qual)?;
                Some((
                    (self.add_umi_to_name(r1_name, umi), rest_seq, rest_qual),
                    (
                        self.add_umi_to_name(r2_name, umi),
                        r2_seq.to_vec(),
                        r2_qual.to_vec(),
                    ),
                ))
            }
            UmiLocation::Read2 => {
                let (umi, rest_seq, rest_qual) = self.split_inline_umi(r2_seq, r2_qual)?;
                Some((
                    (
                        self.add_umi_to_name(r1_name, umi),
                        r1_seq.to_vec(),
                        r1_qual.to_vec(),
                    ),
                    (self.add_umi_to_name(r2_name, umi), rest_seq, rest_qual),
                ))
            }
            UmiLocation::Index | UmiLocation::PerIndex => {
                let umi = self.umi_slice(index_seq?)?;
                Some((
                    (
                        self.add_umi_to_name(r1_name, umi),
                        r1_seq.to_vec(),
                        r1_qual.to_vec(),
                    ),
                    (
                        self.add_umi_to_name(r2_name, umi),
                        r2_seq.to_vec(),
                        r2_qual.to_vec(),
                    ),
                ))
            }
        }
    }

    /// Number of leading bases removed from a read that carries the UMI.
    ///
    /// Returns `0` when the processor is disabled or the UMI comes from the
    /// index sequence; otherwise returns [`UmiConfig::total_consumed`].
    #[inline]
    pub fn trim_start(&self) -> usize {
        if !self.config.is_enabled() {
            return 0;
        }
        match self.config.location {
            UmiLocation::Read1 | UmiLocation::Read2 => self.config.total_consumed(),
            UmiLocation::Index | UmiLocation::PerIndex => 0,
        }
    }
}
/// Parses a UMI location name into a [`UmiLocation`].
///
/// Wraps [`UmiLocation::from_str`] and turns an unrecognised name into an
/// error message that echoes the input and lists the valid options.
///
/// # Errors
///
/// Returns `Err` with a human-readable message when `s` is not one of the
/// spellings accepted by [`UmiLocation::from_str`].
pub fn parse_umi_location(s: &str) -> Result<UmiLocation, String> {
    match UmiLocation::from_str(s) {
        Some(location) => Ok(location),
        None => Err(format!(
            "Invalid UMI location '{}'. Valid options: read1, read2, index, per_index",
            s
        )),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One processed record: `(name, sequence, quality)`.
    type Record = (Vec<u8>, Vec<u8>, Vec<u8>);

    /// Builds an owned record from byte slices for whole-record comparisons.
    fn record(name: &[u8], seq: &[u8], qual: &[u8]) -> Record {
        (name.to_vec(), seq.to_vec(), qual.to_vec())
    }

    /// Runs the shared paired-end fixture through a processor configured with
    /// `location`, `length` and a `":"` separator, expecting success.
    fn process_fixture_pair(
        location: UmiLocation,
        length: usize,
        index_seq: Option<&[u8]>,
    ) -> (Record, Record) {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(location)
                .with_length(length)
                .with_separator(":"),
        );
        processor
            .process_paired_reads(
                b"read1/1",
                b"ACGTNNNNNNNN",
                b"IIIIJJJJJJJJ",
                b"read1/2",
                b"TGCANNNNNNNN",
                b"KKKKLLLLLLLL",
                index_seq,
            )
            .expect("fixture reads are long enough for the configured UMI")
    }

    #[test]
    fn test_umi_location_default() {
        let location: UmiLocation = Default::default();
        assert_eq!(location, UmiLocation::Read1);
    }

    #[test]
    fn test_umi_location_from_str() {
        let cases = [
            ("read1", Some(UmiLocation::Read1)),
            ("Read1", Some(UmiLocation::Read1)),
            ("READ1", Some(UmiLocation::Read1)),
            ("read2", Some(UmiLocation::Read2)),
            ("index", Some(UmiLocation::Index)),
            ("per_index", Some(UmiLocation::PerIndex)),
            ("perindex", Some(UmiLocation::PerIndex)),
            ("invalid", None),
        ];
        for (text, expected) in cases {
            assert_eq!(UmiLocation::from_str(text), expected);
        }
    }

    #[test]
    fn test_umi_location_as_str() {
        let cases = [
            (UmiLocation::Read1, "read1"),
            (UmiLocation::Read2, "read2"),
            (UmiLocation::Index, "index"),
            (UmiLocation::PerIndex, "per_index"),
        ];
        for (location, expected) in cases {
            assert_eq!(location.as_str(), expected);
        }
    }

    #[test]
    fn test_umi_config_default() {
        let UmiConfig {
            enabled,
            location,
            length,
            prefix,
            skip,
            separator,
        } = UmiConfig::default();
        assert!(!enabled);
        assert_eq!(location, UmiLocation::Read1);
        assert_eq!(length, 0);
        assert_eq!(prefix, "");
        assert_eq!(skip, 0);
        assert_eq!(separator, ":");
    }

    #[test]
    fn test_umi_config_builder() {
        let built = UmiConfig::new()
            .enabled()
            .with_location(UmiLocation::Read2)
            .with_length(8)
            .with_prefix("UMI")
            .with_skip(2)
            .with_separator("_");
        assert!(built.enabled);
        assert_eq!(built.location, UmiLocation::Read2);
        assert_eq!(built.length, 8);
        assert_eq!(built.prefix, "UMI");
        assert_eq!(built.skip, 2);
        assert_eq!(built.separator, "_");
        assert_eq!(built.total_consumed(), 10);
    }

    #[test]
    fn test_umi_config_is_enabled() {
        // Default configuration is off.
        assert!(!UmiConfig::default().is_enabled());
        // The flag alone is not enough without a length.
        assert!(!UmiConfig::new().enabled().is_enabled());
        // A non-zero length switches the configuration on.
        assert!(UmiConfig::new().with_length(8).is_enabled());
    }

    #[test]
    fn test_umi_processor_extract_umi() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(8),
        );
        let extracted = processor.extract_umi(b"ACGTACGTNNNNNNNNNNNN", None);
        assert_eq!(extracted.as_deref(), Some(&b"ACGTACGT"[..]));
    }

    #[test]
    fn test_umi_processor_extract_with_skip() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(6)
                .with_skip(2),
        );
        let extracted = processor.extract_umi(b"XXACGTACNNNNNNNN", None);
        assert_eq!(extracted.as_deref(), Some(&b"ACGTAC"[..]));
    }

    #[test]
    fn test_umi_processor_extract_too_short() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(20),
        );
        // Only 8 bases are available for a 20-base UMI.
        assert_eq!(processor.extract_umi(b"ACGTACGT", None), None);
    }

    #[test]
    fn test_umi_processor_extract_from_index() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Index)
                .with_length(6),
        );
        let extracted = processor.extract_umi(b"MAINSEQUENCE", Some(&b"UMIUMIEXTRA"[..]));
        assert_eq!(extracted.as_deref(), Some(&b"UMIUMI"[..]));
    }

    #[test]
    fn test_umi_processor_add_umi_to_name() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_length(8)
                .with_prefix("UMI")
                .with_separator("_"),
        );
        let renamed = processor.add_umi_to_name(b"read1", b"ACGTACGT");
        assert_eq!(renamed, b"read1_UMIACGTACGT".to_vec());
    }

    #[test]
    fn test_umi_processor_add_umi_to_name_no_prefix() {
        let processor = UmiProcessor::new(UmiConfig::new().with_length(8).with_separator(":"));
        let renamed = processor.add_umi_to_name(b"read1", b"ACGTACGT");
        assert_eq!(renamed, b"read1:ACGTACGT".to_vec());
    }

    #[test]
    fn test_umi_processor_process_read() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(4)
                .with_separator(":"),
        );
        let processed = processor.process_read(b"read1", b"ACGTNNNNNNNNNNNN", b"IIIIJJJJJJJJJJJJ", None);
        assert_eq!(
            processed,
            Some(record(b"read1:ACGT", b"NNNNNNNNNNNN", b"JJJJJJJJJJJJ"))
        );
    }

    #[test]
    fn test_umi_processor_process_read_with_skip() {
        let processor = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(4)
                .with_skip(2)
                .with_separator(":"),
        );
        let processed = processor.process_read(b"read1", b"XXACGTNNNNNNNN", b"IIIIIIJJJJJJJJ", None);
        assert_eq!(
            processed,
            Some(record(b"read1:ACGT", b"NNNNNNNN", b"JJJJJJJJ"))
        );
    }

    #[test]
    fn test_umi_processor_process_read_disabled() {
        let processor = UmiProcessor::new(UmiConfig::disabled());
        let processed = processor.process_read(b"read1", b"ACGTNNNN", b"IIIIJJJJ", None);
        // A disabled processor hands every field back untouched.
        assert_eq!(processed, Some(record(b"read1", b"ACGTNNNN", b"IIIIJJJJ")));
    }

    #[test]
    fn test_umi_processor_process_paired_reads_read1() {
        let (first, second) = process_fixture_pair(UmiLocation::Read1, 4, None);
        // Read 1 carries the UMI and is trimmed; read 2 only gets the new name.
        assert_eq!(first, record(b"read1/1:ACGT", b"NNNNNNNN", b"JJJJJJJJ"));
        assert_eq!(second, record(b"read1/2:ACGT", b"TGCANNNNNNNN", b"KKKKLLLLLLLL"));
    }

    #[test]
    fn test_umi_processor_process_paired_reads_read2() {
        let (first, second) = process_fixture_pair(UmiLocation::Read2, 4, None);
        // Read 2 carries the UMI and is trimmed; read 1 only gets the new name.
        assert_eq!(first, record(b"read1/1:TGCA", b"ACGTNNNNNNNN", b"IIIIJJJJJJJJ"));
        assert_eq!(second, record(b"read1/2:TGCA", b"NNNNNNNN", b"LLLLLLLL"));
    }

    #[test]
    fn test_umi_processor_process_paired_reads_index() {
        let (first, second) =
            process_fixture_pair(UmiLocation::Index, 6, Some(&b"UMIUMIEXTRA"[..]));
        // The UMI comes from the index, so neither read is trimmed.
        assert_eq!(first, record(b"read1/1:UMIUMI", b"ACGTNNNNNNNN", b"IIIIJJJJJJJJ"));
        assert_eq!(second, record(b"read1/2:UMIUMI", b"TGCANNNNNNNN", b"KKKKLLLLLLLL"));
    }

    #[test]
    fn test_umi_processor_trim_start() {
        let inline = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Read1)
                .with_length(8)
                .with_skip(2),
        );
        assert_eq!(inline.trim_start(), 10);

        let indexed = UmiProcessor::new(
            UmiConfig::new()
                .with_location(UmiLocation::Index)
                .with_length(8),
        );
        assert_eq!(indexed.trim_start(), 0);

        let disabled = UmiProcessor::new(UmiConfig::disabled());
        assert_eq!(disabled.trim_start(), 0);
    }

    #[test]
    fn test_parse_umi_location() {
        assert!(matches!(parse_umi_location("read1"), Ok(_)));
        assert!(matches!(parse_umi_location("invalid"), Err(_)));
    }
}