use super::TrimResult;
#[derive(Debug, Clone)]
pub struct LongReadConfig {
pub adapter: Option<Vec<u8>>,
pub max_mismatch: usize,
pub min_segment_length: usize,
pub quality_threshold: u8,
pub min_low_quality_length: usize,
}
impl Default for LongReadConfig {
fn default() -> Self {
Self {
adapter: None,
max_mismatch: 3,
min_segment_length: 200,
quality_threshold: 7,
min_low_quality_length: 50,
}
}
}
impl LongReadConfig {
pub fn new() -> Self {
Self::default()
}
pub fn with_adapter(mut self, adapter: Vec<u8>) -> Self {
self.adapter = Some(adapter);
self
}
pub fn with_max_mismatch(mut self, max: usize) -> Self {
self.max_mismatch = max;
self
}
pub fn with_min_segment_length(mut self, length: usize) -> Self {
self.min_segment_length = length;
self
}
pub fn with_quality_threshold(mut self, threshold: u8) -> Self {
self.quality_threshold = threshold;
self
}
pub fn with_min_low_quality_length(mut self, length: usize) -> Self {
self.min_low_quality_length = length;
self
}
}
pub fn split_on_adapter(seq: &[u8], _qual: &[u8], adapter: &[u8]) -> Vec<TrimResult> {
if seq.is_empty() || adapter.is_empty() {
return vec![TrimResult::full(seq.len())];
}
let positions = find_all_adapter_positions(seq, adapter, 3);
if positions.is_empty() {
return vec![TrimResult::full(seq.len())];
}
let mut segments = Vec::new();
let mut prev_end = 0;
for pos in positions {
if pos > prev_end {
segments.push(TrimResult::new(prev_end, pos));
}
prev_end = pos + adapter.len();
}
if prev_end < seq.len() {
segments.push(TrimResult::new(prev_end, seq.len()));
}
segments.retain(|s| s.len() >= 200);
if segments.is_empty() {
vec![TrimResult::full(seq.len())]
} else {
segments
}
}
pub fn split_on_low_quality(
seq: &[u8],
qual: &[u8],
threshold: u8,
min_length: usize,
) -> Vec<TrimResult> {
let len = qual.len().min(seq.len());
if len == 0 {
return vec![TrimResult::empty()];
}
let low_qual_regions = find_low_quality_regions(qual, threshold, min_length);
if low_qual_regions.is_empty() {
return vec![TrimResult::full(len)];
}
let mut segments = Vec::new();
let mut prev_end = 0;
for (start, end) in low_qual_regions {
if start > prev_end {
segments.push(TrimResult::new(prev_end, start));
}
prev_end = end;
}
if prev_end < len {
segments.push(TrimResult::new(prev_end, len));
}
segments.retain(|s| s.len() >= 200);
if segments.is_empty() {
vec![TrimResult::full(len)]
} else {
segments
}
}
fn find_all_adapter_positions(seq: &[u8], adapter: &[u8], max_mismatch: usize) -> Vec<usize> {
let mut positions = Vec::new();
let seq_len = seq.len();
let adapter_len = adapter.len();
if seq_len < adapter_len {
return positions;
}
for start in 0..=seq_len - adapter_len {
let mut mismatches = 0;
let mut matched = true;
for i in 0..adapter_len {
if seq[start + i] != adapter[i] {
mismatches += 1;
if mismatches > max_mismatch {
matched = false;
break;
}
}
}
if matched {
positions.push(start);
}
}
positions
}
fn find_low_quality_regions(qual: &[u8], threshold: u8, min_length: usize) -> Vec<(usize, usize)> {
let mut regions = Vec::new();
let len = qual.len();
let threshold_ascii = threshold + 33;
let mut in_low_qual = false;
let mut region_start = 0;
for (i, &q) in qual.iter().enumerate().take(len) {
let is_low = q < threshold_ascii;
if is_low && !in_low_qual {
in_low_qual = true;
region_start = i;
} else if !is_low && in_low_qual {
in_low_qual = false;
let region_len = i - region_start;
if region_len >= min_length {
regions.push((region_start, i));
}
}
}
if in_low_qual {
let region_len = len - region_start;
if region_len >= min_length {
regions.push((region_start, len));
}
}
regions
}
pub type LongReadTrimmer = LongReadConfig;
#[cfg(test)]
mod tests {
use super::*;
fn make_qual(scores: &[u8]) -> Vec<u8> {
scores.iter().map(|&s| s + 33).collect()
}
#[test]
fn test_long_read_config_default() {
let config = LongReadConfig::default();
assert!(config.adapter.is_none());
assert_eq!(config.max_mismatch, 3);
assert_eq!(config.min_segment_length, 200);
assert_eq!(config.quality_threshold, 7);
}
#[test]
fn test_split_on_adapter_no_adapter() {
let seq = b"ACGTACGTACGT";
let qual = make_qual(&[30; 12]);
let segments = split_on_adapter(seq, &qual, b"NNNNNN");
assert_eq!(segments.len(), 1);
assert_eq!(segments[0], TrimResult::full(12));
}
#[test]
fn test_split_on_adapter_empty() {
let segments = split_on_adapter(&[], &[], b"ACGT");
assert_eq!(segments.len(), 1);
}
#[test]
fn test_split_on_adapter_single() {
let adapter = b"ACGTACGTAC";
let mut seq = Vec::new();
seq.extend_from_slice(&[b'G'; 250]); seq.extend_from_slice(adapter);
seq.extend_from_slice(&[b'T'; 250]);
let qual = make_qual(&vec![30; seq.len()]);
let segments = split_on_adapter(&seq, &qual, adapter);
assert_eq!(segments.len(), 2);
assert!(segments[0].len() >= 200);
assert!(segments[1].len() >= 200);
let total_len: usize = segments.iter().map(|s| s.len()).sum();
assert!(total_len > 400); }
#[test]
fn test_split_on_low_quality_no_low_regions() {
let seq = b"ACGTACGTACGT";
let qual = make_qual(&[30; 12]);
let segments = split_on_low_quality(seq, &qual, 10, 5);
assert_eq!(segments.len(), 1);
assert_eq!(segments[0], TrimResult::full(12));
}
#[test]
fn test_split_on_low_quality_empty() {
let segments = split_on_low_quality(&[], &[], 10, 5);
assert_eq!(segments.len(), 1);
assert!(segments[0].is_empty());
}
#[test]
fn test_split_on_low_quality_internal_region() {
let mut qual_scores = vec![30u8; 700];
for i in 300..400 {
qual_scores[i] = 5; }
let qual = make_qual(&qual_scores);
let seq = vec![b'A'; 700];
let segments = split_on_low_quality(&seq, &qual, 10, 50);
assert_eq!(segments.len(), 2);
}
#[test]
fn test_split_on_low_quality_short_region_ignored() {
let mut qual_scores = vec![30u8; 100];
for i in 40..50 {
qual_scores[i] = 5; }
let qual = make_qual(&qual_scores);
let seq = vec![b'A'; 100];
let segments = split_on_low_quality(&seq, &qual, 10, 50);
assert_eq!(segments.len(), 1);
}
#[test]
fn test_find_all_adapter_positions_exact() {
let seq = b"AAAGGGGAACCCGGGGT";
let adapter = b"GGGG";
let positions = find_all_adapter_positions(seq, adapter, 0);
assert!(positions.contains(&3));
assert!(positions.contains(&12));
}
#[test]
fn test_find_all_adapter_positions_with_mismatch() {
let seq = b"AAAGGGXAAA";
let adapter = b"GGGG";
let positions = find_all_adapter_positions(seq, adapter, 1);
assert!(positions.contains(&3));
}
#[test]
fn test_find_low_quality_regions() {
let qual = make_qual(&[30, 30, 5, 5, 5, 5, 5, 30, 30]);
let regions = find_low_quality_regions(&qual, 10, 3);
assert_eq!(regions.len(), 1);
assert_eq!(regions[0], (2, 7));
}
#[test]
fn test_find_low_quality_regions_at_end() {
let qual = make_qual(&[30, 30, 30, 5, 5, 5, 5, 5]);
let regions = find_low_quality_regions(&qual, 10, 3);
assert_eq!(regions.len(), 1);
assert_eq!(regions[0], (3, 8));
}
#[test]
fn test_find_low_quality_regions_at_start() {
let qual = make_qual(&[5, 5, 5, 5, 5, 30, 30, 30]);
let regions = find_low_quality_regions(&qual, 10, 3);
assert_eq!(regions.len(), 1);
assert_eq!(regions[0], (0, 5));
}
#[test]
fn test_config_builder() {
let config = LongReadConfig::new()
.with_adapter(b"ACGT".to_vec())
.with_max_mismatch(5)
.with_min_segment_length(500)
.with_quality_threshold(10)
.with_min_low_quality_length(100);
assert_eq!(config.adapter.as_deref(), Some(b"ACGT".as_slice()));
assert_eq!(config.max_mismatch, 5);
assert_eq!(config.min_segment_length, 500);
assert_eq!(config.quality_threshold, 10);
assert_eq!(config.min_low_quality_length, 100);
}
}