use super::TrimResult;
/// Settings that control homopolymer ("poly-X") trimming at the ends of a read.
#[derive(Debug, Clone)]
pub struct TailConfig {
    /// Minimum length a run must reach before it is trimmed.
    /// A value of `0` disables trimming altogether.
    pub min_length: usize,

    /// Per-base switches, stored in the order `A`, `T`, `G`, `C`.
    pub enabled_bases: [bool; 4],

    /// When `true`, the 5' end (start of the read) is trimmed in addition to
    /// the 3' end.
    pub trim_5_prime: bool,

    /// Number of non-matching bases tolerated while scanning a run.
    pub max_mismatch: usize,
}
impl Default for TailConfig {
    /// Builds the baseline configuration: runs of at least 10 bases, every
    /// base enabled, 3'-only trimming, and one tolerated mismatch.
    fn default() -> Self {
        // All four slots (A, T, G, C) start switched on.
        let enabled_bases = [true; 4];
        Self {
            enabled_bases,
            trim_5_prime: false,
            max_mismatch: 1,
            min_length: 10,
        }
    }
}
impl TailConfig {
    /// Creates a configuration populated with the default settings.
    pub fn new() -> Self {
        Default::default()
    }

    /// Preset that targets only runs of `A`; every other setting is default.
    pub fn poly_a() -> Self {
        let mut enabled_bases = [false; 4];
        enabled_bases[0] = true; // slot 0 holds the switch for A
        Self {
            enabled_bases,
            ..Self::default()
        }
    }

    /// Preset that targets only runs of `G`; every other setting is default.
    pub fn poly_g() -> Self {
        let mut enabled_bases = [false; 4];
        enabled_bases[2] = true; // slot 2 holds the switch for G
        Self {
            enabled_bases,
            ..Self::default()
        }
    }

    /// Preset that returns the default configuration unchanged.
    pub fn poly_x() -> Self {
        Self::new()
    }

    /// Returns the configuration with `min_length` replaced by `length`.
    pub fn with_min_length(self, length: usize) -> Self {
        Self {
            min_length: length,
            ..self
        }
    }

    /// Switches trimming for `base` on or off.
    ///
    /// `base` is matched case-insensitively against `A`, `T`, `G` and `C`;
    /// any other byte leaves the configuration untouched.
    pub fn with_base_enabled(mut self, base: u8, enabled: bool) -> Self {
        match base_to_index(base) {
            Some(slot) => {
                self.enabled_bases[slot] = enabled;
                self
            }
            // Unrecognised bases are ignored rather than reported.
            None => self,
        }
    }

    /// Returns the configuration with 5' trimming switched on or off.
    pub fn with_trim_5_prime(self, enabled: bool) -> Self {
        Self {
            trim_5_prime: enabled,
            ..self
        }
    }

    /// Returns the configuration with the mismatch allowance set to `max`.
    pub fn with_max_mismatch(self, max: usize) -> Self {
        Self {
            max_mismatch: max,
            ..self
        }
    }
}
/// Maps a nucleotide byte to its slot in the `A`, `T`, `G`, `C` ordering.
///
/// Matching ignores ASCII case. Bytes outside those four bases yield `None`.
#[inline]
fn base_to_index(base: u8) -> Option<usize> {
    // Slot order shared with `enabled_bases`.
    const ORDER: &[u8; 4] = b"ATGC";
    let upper = base.to_ascii_uppercase();
    ORDER.iter().position(|&candidate| candidate == upper)
}
/// Trims homopolymer runs from `seq` according to `config`.
///
/// The 3' end is always examined. The 5' end is examined only when
/// `config.trim_5_prime` is set, and only within the part of the read that
/// survived 3' trimming.
///
/// Returns `TrimResult::full(seq.len())` when `seq` is empty or
/// `config.min_length` is zero, `TrimResult::empty()` when nothing is left
/// after trimming, and `TrimResult::new(start, end)` otherwise.
pub fn trim_poly_tail(seq: &[u8], config: &TailConfig) -> TrimResult {
    let nothing_to_do = seq.is_empty() || config.min_length == 0;
    if nothing_to_do {
        return TrimResult::full(seq.len());
    }

    let end = trim_3_prime(seq, config);
    let start = if config.trim_5_prime && end > 0 {
        // Only scan what is left once the 3' tail has been cut away.
        trim_5_prime(&seq[..end], config)
    } else {
        0
    };

    if start < end {
        TrimResult::new(start, end)
    } else {
        TrimResult::empty()
    }
}
/// Returns the exclusive end index of `seq` after 3' trimming.
///
/// Enabled bases are tried in slot order (`A`, `T`, `G`, `C`) and the first
/// base for which a tail is found decides the cut. When no tail is found, or
/// the read is shorter than `config.min_length`, `seq.len()` is returned.
fn trim_3_prime(seq: &[u8], config: &TailConfig) -> usize {
    let untrimmed = seq.len();
    if untrimmed < config.min_length {
        return untrimmed;
    }

    config
        .enabled_bases
        .iter()
        .enumerate()
        .filter(|&(_, &enabled)| enabled)
        .find_map(|(slot, _)| {
            find_poly_tail_3prime(
                seq,
                index_to_base(slot),
                config.min_length,
                config.max_mismatch,
            )
        })
        .unwrap_or(untrimmed)
}
/// Returns the start index of `seq` after 5' trimming.
///
/// Enabled bases are tried in slot order (`A`, `T`, `G`, `C`) and the first
/// base for which a leading run is found decides the cut. When no run is
/// found, or the read is shorter than `config.min_length`, `0` is returned.
fn trim_5_prime(seq: &[u8], config: &TailConfig) -> usize {
    if seq.len() < config.min_length {
        return 0;
    }

    (0..config.enabled_bases.len())
        .filter(|&slot| config.enabled_bases[slot])
        .find_map(|slot| {
            find_poly_tail_5prime(
                seq,
                index_to_base(slot),
                config.min_length,
                config.max_mismatch,
            )
        })
        .unwrap_or(0)
}
/// Looks for a run of `target` at the 3' end (right-hand side) of `seq`.
///
/// The scan walks from the last base towards the first, matching `target`
/// case-insensitively, and stops as soon as more than `max_mismatch`
/// non-matching bases have been seen.
///
/// The tail always starts on a matching base: tolerated mismatches are only
/// absorbed when they lie between matching bases or at the very end of the
/// read. A mismatch on the inner edge of the run is neither trimmed nor
/// counted towards `min_length`.
///
/// Returns `Some(pos)` where `seq[pos..]` is the tail to remove, provided the
/// tail is at least `min_length` bases long; otherwise `None`.
fn find_poly_tail_3prime(seq: &[u8], target: u8, min_length: usize, max_mismatch: usize) -> Option<usize> {
    let len = seq.len();
    if len < min_length {
        return None;
    }

    let mut mismatches = 0usize;
    // Index of the innermost matching base seen so far; `len` means "none yet".
    let mut trim_pos = len;

    for (i, &base) in seq.iter().enumerate().rev() {
        if base.eq_ignore_ascii_case(&target) {
            trim_pos = i;
        } else {
            mismatches += 1;
            if mismatches > max_mismatch {
                break;
            }
            // A tolerated mismatch does not move `trim_pos`; it only becomes
            // part of the tail if a later matching base extends past it.
        }
    }

    // Length of the tail measured from its innermost matching base.
    if len - trim_pos >= min_length {
        Some(trim_pos)
    } else {
        None
    }
}
/// Looks for a run of `target` at the 5' end (left-hand side) of `seq`.
///
/// The scan walks from the first base towards the last, matching `target`
/// case-insensitively, and stops as soon as more than `max_mismatch`
/// non-matching bases have been seen.
///
/// The run always ends on a matching base: tolerated mismatches are only
/// absorbed when they lie between matching bases or at the very start of the
/// read. A mismatch on the inner edge of the run is neither trimmed nor
/// counted towards `min_length`.
///
/// Returns `Some(pos)` where `seq[..pos]` is the run to remove, provided the
/// run is at least `min_length` bases long; otherwise `None`.
fn find_poly_tail_5prime(seq: &[u8], target: u8, min_length: usize, max_mismatch: usize) -> Option<usize> {
    if seq.len() < min_length {
        return None;
    }

    let mut mismatches = 0usize;
    // Exclusive end of the run, one past the innermost matching base; `0`
    // means "no match yet". It is also the run's length.
    let mut trim_pos = 0usize;

    for (i, &base) in seq.iter().enumerate() {
        if base.eq_ignore_ascii_case(&target) {
            trim_pos = i + 1;
        } else {
            mismatches += 1;
            if mismatches > max_mismatch {
                break;
            }
            // A tolerated mismatch does not move `trim_pos`; it only becomes
            // part of the run if a later matching base extends past it.
        }
    }

    if trim_pos >= min_length {
        Some(trim_pos)
    } else {
        None
    }
}
/// Maps a slot in the `A`, `T`, `G`, `C` ordering back to its uppercase base.
///
/// Any slot outside `0..4` yields `b'N'`.
#[inline]
fn index_to_base(idx: usize) -> u8 {
    b"ATGC".get(idx).copied().unwrap_or(b'N')
}
/// Alias for [`TailConfig`]; both names refer to the same type.
pub type TailTrimmer = TailConfig;
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration enables every base and trims the 3' end only.
    #[test]
    fn test_tail_config_default() {
        let cfg = TailConfig::default();
        assert_eq!(cfg.min_length, 10);
        assert_eq!(cfg.enabled_bases, [true; 4]);
        assert!(!cfg.trim_5_prime);
    }

    /// The poly-A preset enables slot 0 (A) and nothing else.
    #[test]
    fn test_tail_config_poly_a() {
        let cfg = TailConfig::poly_a();
        assert_eq!(cfg.enabled_bases, [true, false, false, false]);
    }

    /// The poly-G preset enables slot 2 (G) and nothing else.
    #[test]
    fn test_tail_config_poly_g() {
        let cfg = TailConfig::poly_g();
        assert_eq!(cfg.enabled_bases, [false, false, true, false]);
    }

    /// A long poly-A tail is removed from the 3' end; the start is untouched.
    #[test]
    fn test_trim_poly_tail_3prime() {
        let read = b"ACGTACGTACGTAAAAAAAAAA";
        let cfg = TailConfig::poly_a().with_min_length(5);
        let trimmed = trim_poly_tail(read, &cfg);
        assert!(trimmed.end < read.len());
        assert_eq!(trimmed.start, 0);
    }

    /// A long poly-G tail is removed from the 3' end.
    #[test]
    fn test_trim_poly_g_tail() {
        let read = b"ACGTACGTACGTGGGGGGGGGG";
        let cfg = TailConfig::poly_g().with_min_length(5);
        let trimmed = trim_poly_tail(read, &cfg);
        assert!(trimmed.end < read.len());
    }

    /// A read without a homopolymer tail comes back untrimmed.
    #[test]
    fn test_trim_poly_tail_no_tail() {
        let read = b"ACGTACGTACGTACGT";
        let cfg = TailConfig::poly_a().with_min_length(10);
        let trimmed = trim_poly_tail(read, &cfg);
        assert_eq!(trimmed.start, 0);
        assert_eq!(trimmed.end, read.len());
    }

    /// An empty read yields the range 0..0.
    #[test]
    fn test_trim_poly_tail_empty() {
        let cfg = TailConfig::default();
        let trimmed = trim_poly_tail(&[], &cfg);
        assert_eq!(trimmed.start, 0);
        assert_eq!(trimmed.end, 0);
    }

    /// A tail shorter than `min_length` is left in place.
    #[test]
    fn test_trim_poly_tail_short_tail() {
        let read = b"ACGTACGTACGTAAA";
        let cfg = TailConfig::poly_a().with_min_length(10);
        let trimmed = trim_poly_tail(read, &cfg);
        assert_eq!(trimmed.end, read.len());
    }

    /// With 5' trimming enabled, a leading poly-A run moves the start forward.
    #[test]
    fn test_trim_poly_tail_5prime() {
        let read = b"AAAAAAAAACGTACGT";
        let cfg = TailConfig::poly_a()
            .with_min_length(5)
            .with_trim_5_prime(true);
        let trimmed = trim_poly_tail(read, &cfg);
        assert!(trimmed.start > 0);
    }

    /// With runs on both ends, at least one end is trimmed.
    #[test]
    fn test_trim_poly_tail_both_ends() {
        let read = b"AAAAACGTAAAAAA";
        let cfg = TailConfig::poly_a()
            .with_min_length(4)
            .with_trim_5_prime(true);
        let trimmed = trim_poly_tail(read, &cfg);
        assert!(trimmed.start > 0 || trimmed.end < read.len());
    }

    /// Both letter cases map to the same slot; other bytes map to `None`.
    #[test]
    fn test_base_to_index() {
        let expected = [
            (b'A', Some(0)),
            (b'a', Some(0)),
            (b'T', Some(1)),
            (b't', Some(1)),
            (b'G', Some(2)),
            (b'g', Some(2)),
            (b'C', Some(3)),
            (b'c', Some(3)),
            (b'N', None),
        ];
        for &(base, slot) in expected.iter() {
            assert_eq!(base_to_index(base), slot);
        }
    }

    /// Slots 0..4 map to A, T, G, C; slot 4 falls back to N.
    #[test]
    fn test_index_to_base() {
        for (slot, &base) in b"ATGCN".iter().enumerate() {
            assert_eq!(index_to_base(slot), base);
        }
    }

    /// Every builder method stores the value it was given.
    #[test]
    fn test_config_builder() {
        let cfg = TailConfig::new()
            .with_min_length(15)
            .with_base_enabled(b'A', true)
            .with_base_enabled(b'T', false)
            .with_trim_5_prime(true)
            .with_max_mismatch(2);
        assert_eq!(cfg.min_length, 15);
        assert!(cfg.enabled_bases[0]);
        assert!(!cfg.enabled_bases[1]);
        assert!(cfg.trim_5_prime);
        assert_eq!(cfg.max_mismatch, 2);
    }

    /// A single interior mismatch does not prevent the tail from being found.
    #[test]
    fn test_trim_with_mismatch_tolerance() {
        let read = b"ACGTACGTAAAAATAAAAAA";
        let cfg = TailConfig::poly_a()
            .with_min_length(10)
            .with_max_mismatch(1);
        let trimmed = trim_poly_tail(read, &cfg);
        assert!(trimmed.end < read.len());
    }

    /// Lowercase tail bases are matched like uppercase ones.
    #[test]
    fn test_lowercase_handling() {
        let read = b"ACGTACGTaaaaaaaaaaa";
        let cfg = TailConfig::poly_a().with_min_length(5);
        let trimmed = trim_poly_tail(read, &cfg);
        assert!(trimmed.end < read.len());
    }
}