use bio::bio_types::sequence::Sequence;
use std::sync::Arc;
use crate::errors::seq_to_string_or_log;
use crate::lib_spec::{DistanceMetric, Library, LibraryRegion, PartialMatching, merge_matches};
#[derive(Debug, Hash, PartialEq, Eq, Clone)]
pub struct RegionKey {
pub id: String,
pub sequence: Sequence,
pub completeness: RegionCompleteness,
}
impl RegionKey {
pub fn new(id: String, sequence: Sequence, completeness: RegionCompleteness) -> Self {
Self {
id,
sequence,
completeness,
}
}
}
#[derive(Debug)]
pub struct ObservedRegion {
pub id: String,
pub seq: Sequence,
pub completeness: RegionCompleteness,
pub nearest_matches: RegionMatch,
}
impl ObservedRegion {
pub fn new(id: String, seq: &[u8], complete: RegionCompleteness) -> Self {
Self {
id,
seq: seq.to_vec(),
completeness: complete,
nearest_matches: RegionMatch::Uncompared,
}
}
pub fn len(&self) -> usize {
self.seq.len()
}
pub fn is_empty(&self) -> bool {
self.seq.is_empty()
}
pub fn is_compared_to_library(&self) -> bool {
!matches!(self.nearest_matches, RegionMatch::Uncompared)
}
pub fn compare_to_library(
&self,
library: &Library,
distance_metric: DistanceMetric,
max_matches: usize,
) -> RegionMatch {
let lib_match = match self.completeness {
RegionCompleteness::Complete => {
match library.lookup(&self.id, &self.seq, distance_metric, PartialMatching::Full) {
Err(_) => return RegionMatch::NoLibrary { seq: None },
Ok(x) => x,
}
}
RegionCompleteness::Partial5Prime => {
match library.lookup(
&self.id,
&self.seq,
distance_metric,
PartialMatching::ThreePrimeOnly,
) {
Err(_) => return RegionMatch::NoLibrary { seq: None },
Ok(x) => x,
}
}
RegionCompleteness::Partial3Prime => {
match library.lookup(
&self.id,
&self.seq,
distance_metric,
PartialMatching::FivePrimeOnly,
) {
Err(_) => return RegionMatch::NoLibrary { seq: None },
Ok(x) => x,
}
}
RegionCompleteness::MissingCenter { split_ind }
| RegionCompleteness::Overlapping { split_ind } => {
let left_match = library.lookup(
&self.id,
&self.seq[0..split_ind],
distance_metric,
PartialMatching::FivePrimeOnly,
);
let right_match = library.lookup(
&self.id,
&self.seq[split_ind + 1..self.seq.len()],
distance_metric,
PartialMatching::ThreePrimeOnly,
);
match (left_match, right_match) {
(Ok(left), Ok(right)) => merge_matches(left, right),
(_, Err(_)) | (Err(_), _) => return RegionMatch::NoLibrary { seq: None },
}
}
};
match lib_match {
None => RegionMatch::Unmatched,
Some(x) => {
if x.matches.len() == 1 {
RegionMatch::Match {
seq_match: x.matches[0].clone(),
distance: x.distance,
}
} else if x.matches.len() > max_matches {
RegionMatch::Overmatched {
distance: x.distance,
matches: x.matches.len(),
}
} else {
RegionMatch::MultiMatch {
seq_matches: x.matches,
distance: x.distance,
}
}
}
}
}
pub fn to_tsv_chunk(&self) -> String {
let mut seq = match String::from_utf8(self.seq.clone()) {
Ok(x) => x,
Err(_) => panic!("Error converting Vec<u8> sequence to string"),
};
match self.completeness {
RegionCompleteness::Complete
| RegionCompleteness::MissingCenter { .. }
| RegionCompleteness::Overlapping { .. } => {}
RegionCompleteness::Partial5Prime => seq.insert(0, '^'),
RegionCompleteness::Partial3Prime => seq.push('^'),
};
let (nearest, distance, matches) = self.nearest_matches.to_str_fields();
format!("{}\t{}\t{}\t{}", seq, nearest, distance, matches)
}
}
#[derive(Debug, Eq, Hash, PartialEq, Clone, Copy)]
pub enum RegionCompleteness {
Complete,
Partial5Prime,
Partial3Prime,
MissingCenter { split_ind: usize },
Overlapping { split_ind: usize },
}
#[derive(Debug, Eq, Hash, PartialEq, Clone)]
pub enum RegionMatch {
Uncompared,
Match {
seq_match: Arc<LibraryRegion>,
distance: u64,
},
MultiMatch {
seq_matches: Vec<Arc<LibraryRegion>>,
distance: u64,
},
Overmatched { distance: u64, matches: usize },
Unmatched,
NoLibrary { seq: Option<Sequence> },
}
impl RegionMatch {
fn to_str_fields(&self) -> (String, String, String) {
match self {
RegionMatch::Uncompared | RegionMatch::Unmatched | RegionMatch::NoLibrary { .. } => {
("".to_string(), "".to_string(), "0".to_string())
}
RegionMatch::Overmatched { distance, matches } => {
("".to_string(), distance.to_string(), matches.to_string())
}
RegionMatch::Match {
seq_match,
distance,
} => (
seq_to_string_or_log(&seq_match.sequence),
distance.to_string(),
"1".to_string(),
),
RegionMatch::MultiMatch {
seq_matches,
distance,
} => {
let mut seqs = seq_matches
.iter()
.map(|x| seq_to_string_or_log(&x.sequence))
.collect::<Vec<_>>();
seqs.sort();
(seqs.join(","), distance.to_string(), seqs.len().to_string())
}
}
}
pub fn str_sequence(&self) -> String {
match self {
RegionMatch::Uncompared | RegionMatch::Unmatched | RegionMatch::Overmatched { .. } => {
"".to_string()
}
RegionMatch::NoLibrary { seq } => match seq {
Some(s) => seq_to_string_or_log(s),
None => "".to_string(),
},
RegionMatch::Match { seq_match, .. } => seq_to_string_or_log(&seq_match.sequence),
RegionMatch::MultiMatch { seq_matches, .. } => seq_matches
.iter()
.map(|x| seq_to_string_or_log(&x.sequence))
.collect::<Vec<_>>()
.join(","),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn make_region(id: &str, seq: &[u8], c: RegionCompleteness) -> ObservedRegion {
ObservedRegion::new(id.to_string(), seq, c)
}
#[test]
fn new_complete_region_holds_data() {
let r = make_region("barcode", b"ACGTACGT", RegionCompleteness::Complete);
assert_eq!(r.id, "barcode");
assert_eq!(r.seq, b"ACGTACGT");
assert_eq!(r.len(), 8);
assert!(matches!(r.completeness, RegionCompleteness::Complete));
}
#[test]
fn empty_sequence_is_valid() {
let r = make_region("empty", b"", RegionCompleteness::Complete);
assert_eq!(r.id, "empty");
assert_eq!(r.len(), 0);
assert!(r.is_empty());
assert_eq!(r.seq, b"");
}
#[test]
fn preserves_bytes_verbatim() {
let weird = b"ACGTNN--acgt\x00\xff";
let r = make_region("weird", weird, RegionCompleteness::Complete);
assert_eq!(r.seq, weird, "region bytes changed unexpectedly");
assert_eq!(r.len(), weird.len());
}
#[test]
fn owns_its_sequence() {
let mut buf = b"AAAA".to_vec();
let r = make_region("id", &buf, RegionCompleteness::Complete);
buf[0] = b'T'; assert_eq!(r.seq, b"AAAA", "region leaked aliasing to input slice");
}
#[test]
fn size_extremes_smoke() {
let r1 = make_region("s", b"A", RegionCompleteness::Complete);
assert_eq!(r1.len(), 1);
let big = vec![b'G'; 100_000];
let r2 = make_region("big", &big, RegionCompleteness::Complete);
assert_eq!(r2.len(), 100_000);
assert_eq!(r2.seq[0], b'G');
assert_eq!(r2.seq[99_999], b'G');
}
}