use rustc_hash::FxHashMap;
use super::TrimResult;
const KMER_SIZE: usize = 8;
#[derive(Debug, Clone)]
pub struct AdapterKmerIndex {
kmer_positions: FxHashMap<u64, Vec<usize>>,
adapter: Vec<u8>,
min_match_len: usize,
}
impl AdapterKmerIndex {
pub fn new(adapter: &[u8], min_match_len: usize) -> Self {
let mut kmer_positions: FxHashMap<u64, Vec<usize>> = FxHashMap::default();
if adapter.len() >= KMER_SIZE {
for i in 0..=(adapter.len() - KMER_SIZE) {
let kmer = &adapter[i..i + KMER_SIZE];
if let Some(hash) = hash_kmer(kmer) {
kmer_positions.entry(hash).or_default().push(i);
}
}
}
Self {
kmer_positions,
adapter: adapter.to_vec(),
min_match_len,
}
}
#[inline]
pub fn detect(&self, seq: &[u8], max_mismatch: usize) -> Option<usize> {
if seq.len() < KMER_SIZE || self.adapter.is_empty() {
return None;
}
let seq_len = seq.len();
for i in 0..=(seq_len - KMER_SIZE) {
let kmer = &seq[i..i + KMER_SIZE];
let hash = match hash_kmer(kmer) {
Some(h) => h,
None => continue, };
if let Some(adapter_positions) = self.kmer_positions.get(&hash) {
for &adapter_pos in adapter_positions {
let seq_adapter_start = i.saturating_sub(adapter_pos);
if let Some(pos) = self.verify_match(seq, seq_adapter_start, max_mismatch) {
if pos >= self.min_match_len {
return Some(pos);
}
}
}
}
}
None
}
#[inline]
fn verify_match(&self, seq: &[u8], start: usize, max_mismatch: usize) -> Option<usize> {
let seq_len = seq.len();
let adapter_len = self.adapter.len();
if start >= seq_len {
return None;
}
let overlap_len = (seq_len - start).min(adapter_len);
if overlap_len < KMER_SIZE {
return None;
}
let max_allowed = (overlap_len / 8).max(1).min(max_mismatch);
let mut mismatches = 0;
for i in 0..overlap_len {
if seq[start + i] != self.adapter[i] {
mismatches += 1;
if mismatches > max_allowed {
return None;
}
}
}
Some(start)
}
}
#[inline]
fn hash_kmer(kmer: &[u8]) -> Option<u64> {
let mut hash: u64 = 0;
for &base in kmer {
let bits = match base {
b'A' | b'a' => 0u64,
b'C' | b'c' => 1u64,
b'G' | b'g' => 2u64,
b'T' | b't' => 3u64,
_ => return None, };
hash = (hash << 2) | bits;
}
Some(hash)
}
#[derive(Debug, Clone)]
pub struct AdapterIndices {
pub r1_index: Option<AdapterKmerIndex>,
pub r2_index: Option<AdapterKmerIndex>,
pub additional_indices: Vec<AdapterKmerIndex>,
}
impl AdapterIndices {
pub fn from_config(config: &AdapterConfig) -> Self {
let r1_index = config.adapter_r1.as_ref().map(|a| {
AdapterKmerIndex::new(a, config.min_match_length)
});
let r2_index = config.adapter_r2.as_ref().map(|a| {
AdapterKmerIndex::new(a, config.min_match_length)
});
let additional_indices = config.adapter_list
.iter()
.map(|(_, a)| AdapterKmerIndex::new(a, config.min_match_length))
.collect();
Self {
r1_index,
r2_index,
additional_indices,
}
}
#[inline]
pub fn detect(&self, seq: &[u8], max_mismatch: usize, min_match_length: usize) -> Option<usize> {
let mut best_pos: Option<usize> = None;
if let Some(ref index) = self.r1_index {
if let Some(pos) = index.detect(seq, max_mismatch) {
if pos >= min_match_length {
best_pos = Some(pos);
}
}
}
if let Some(ref index) = self.r2_index {
if let Some(pos) = index.detect(seq, max_mismatch) {
if pos >= min_match_length {
best_pos = match best_pos {
Some(prev) => Some(prev.min(pos)),
None => Some(pos),
};
}
}
}
for index in &self.additional_indices {
if let Some(pos) = index.detect(seq, max_mismatch) {
if pos >= min_match_length {
best_pos = match best_pos {
Some(prev) => Some(prev.min(pos)),
None => Some(pos),
};
}
}
}
best_pos
}
}
#[inline]
pub fn trim_adapter_indexed(seq: &[u8], indices: &AdapterIndices, config: &AdapterConfig) -> TrimResult {
if seq.is_empty() {
return TrimResult::empty();
}
match indices.detect(seq, config.max_mismatch, config.min_match_length) {
Some(pos) => TrimResult::new(0, pos),
None => TrimResult::full(seq.len()),
}
}
pub const ILLUMINA_TRUSEQ_R1: &[u8] = b"AGATCGGAAGAGCACACGTCTGAACTCCAGTCA";
pub const ILLUMINA_TRUSEQ_R2: &[u8] = b"AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT";
pub const NEXTERA_R1: &[u8] = b"CTGTCTCTTATACACATCT";
pub const NEXTERA_R2: &[u8] = b"CTGTCTCTTATACACATCT";
pub const BGI_R1: &[u8] = b"AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA";
pub const BGI_R2: &[u8] = b"AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG";
#[derive(Debug, Clone)]
pub struct AdapterConfig {
pub adapter_r1: Option<Vec<u8>>,
pub adapter_r2: Option<Vec<u8>>,
pub adapter_list: Vec<(String, Vec<u8>)>,
pub auto_detect: bool,
pub min_overlap: usize,
pub max_mismatch: usize,
pub min_match_length: usize,
pub start_adapter: Option<Vec<u8>>,
pub end_adapter: Option<Vec<u8>>,
pub distance_threshold: f64,
pub trimming_extension: usize,
}
impl Default for AdapterConfig {
fn default() -> Self {
Self {
adapter_r1: Some(ILLUMINA_TRUSEQ_R1.to_vec()),
adapter_r2: Some(ILLUMINA_TRUSEQ_R2.to_vec()),
adapter_list: Vec::new(),
auto_detect: false,
min_overlap: 30,
max_mismatch: 3,
min_match_length: 10,
start_adapter: None,
end_adapter: None,
distance_threshold: 0.25,
trimming_extension: 10,
}
}
}
impl AdapterConfig {
pub fn new() -> Self {
Self::default()
}
pub fn disabled() -> Self {
Self {
adapter_r1: None,
adapter_r2: None,
adapter_list: Vec::new(),
auto_detect: false,
min_overlap: 30,
max_mismatch: 3,
min_match_length: 10,
start_adapter: None,
end_adapter: None,
distance_threshold: 0.25,
trimming_extension: 10,
}
}
pub fn truseq() -> Self {
Self {
adapter_r1: Some(ILLUMINA_TRUSEQ_R1.to_vec()),
adapter_r2: Some(ILLUMINA_TRUSEQ_R2.to_vec()),
..Self::default()
}
}
pub fn nextera() -> Self {
Self {
adapter_r1: Some(NEXTERA_R1.to_vec()),
adapter_r2: Some(NEXTERA_R2.to_vec()),
..Self::default()
}
}
pub fn with_adapter_r1(mut self, adapter: Vec<u8>) -> Self {
self.adapter_r1 = Some(adapter);
self
}
pub fn with_adapter_r2(mut self, adapter: Vec<u8>) -> Self {
self.adapter_r2 = Some(adapter);
self
}
pub fn with_auto_detect(mut self, enabled: bool) -> Self {
self.auto_detect = enabled;
self
}
pub fn with_min_overlap(mut self, overlap: usize) -> Self {
self.min_overlap = overlap;
self
}
pub fn with_max_mismatch(mut self, max: usize) -> Self {
self.max_mismatch = max;
self
}
pub fn with_adapter_list(mut self, adapters: Vec<(String, Vec<u8>)>) -> Self {
self.adapter_list = adapters;
self
}
pub fn with_start_adapter(mut self, adapter: Vec<u8>) -> Self {
self.start_adapter = Some(adapter);
self
}
pub fn with_end_adapter(mut self, adapter: Vec<u8>) -> Self {
self.end_adapter = Some(adapter);
self
}
pub fn with_distance_threshold(mut self, threshold: f64) -> Self {
self.distance_threshold = threshold;
self
}
pub fn with_trimming_extension(mut self, extension: usize) -> Self {
self.trimming_extension = extension;
self
}
}
pub fn detect_adapter(seq: &[u8], adapter: &[u8], max_mismatch: usize) -> Option<usize> {
if seq.is_empty() || adapter.is_empty() {
return None;
}
let seq_len = seq.len();
let adapter_len = adapter.len();
for start in 0..seq_len {
let overlap_len = (seq_len - start).min(adapter_len);
if overlap_len < 8 {
continue;
}
let mut mismatches = 0;
let max_allowed = (overlap_len / 10).max(1).min(max_mismatch);
for i in 0..overlap_len {
if seq[start + i] != adapter[i] {
mismatches += 1;
if mismatches > max_allowed {
break;
}
}
}
if mismatches <= max_allowed {
return Some(start);
}
}
None
}
pub fn trim_adapter(seq: &[u8], config: &AdapterConfig) -> TrimResult {
if seq.is_empty() {
return TrimResult::empty();
}
let mut best_pos: Option<usize> = None;
if let Some(ref adapter) = config.adapter_r1 {
if let Some(pos) = detect_adapter(seq, adapter, config.max_mismatch) {
if pos >= config.min_match_length {
best_pos = Some(pos);
}
}
}
if let Some(ref adapter) = config.adapter_r2 {
if let Some(pos) = detect_adapter(seq, adapter, config.max_mismatch) {
if pos >= config.min_match_length {
best_pos = match best_pos {
Some(prev) => Some(prev.min(pos)),
None => Some(pos),
};
}
}
}
for (_name, adapter) in &config.adapter_list {
if let Some(pos) = detect_adapter(seq, adapter, config.max_mismatch) {
if pos >= config.min_match_length {
best_pos = match best_pos {
Some(prev) => Some(prev.min(pos)),
None => Some(pos),
};
}
}
}
match best_pos {
Some(pos) => TrimResult::new(0, pos),
None => TrimResult::full(seq.len()),
}
}
#[inline]
pub fn trim_adapter_targeted(seq: &[u8], config: &AdapterConfig, is_read2: bool) -> TrimResult {
if seq.is_empty() {
return TrimResult::empty();
}
let (primary_adapter, secondary_adapter) = if is_read2 {
(&config.adapter_r2, &config.adapter_r1)
} else {
(&config.adapter_r1, &config.adapter_r2)
};
if let Some(ref adapter) = primary_adapter {
if let Some(pos) = detect_adapter(seq, adapter, config.max_mismatch) {
if pos >= config.min_match_length {
return TrimResult::new(0, pos);
}
}
}
if let Some(ref adapter) = secondary_adapter {
if let Some(pos) = detect_adapter(seq, adapter, config.max_mismatch) {
if pos >= config.min_match_length {
return TrimResult::new(0, pos);
}
}
}
for (_name, adapter) in &config.adapter_list {
if let Some(pos) = detect_adapter(seq, adapter, config.max_mismatch) {
if pos >= config.min_match_length {
return TrimResult::new(0, pos);
}
}
}
TrimResult::full(seq.len())
}
pub fn detect_adapter_from_overlap(
r1: &[u8],
r2: &[u8],
min_overlap: usize,
) -> Option<(Vec<u8>, Vec<u8>)> {
if r1.len() < min_overlap || r2.len() < min_overlap {
return None;
}
let r2_rc = reverse_complement(r2);
for overlap_len in min_overlap..=r1.len().min(r2_rc.len()) {
let r1_start = r1.len() - overlap_len;
let r1_overlap = &r1[r1_start..];
let r2_overlap = &r2_rc[..overlap_len];
let mismatches: usize = r1_overlap
.iter()
.zip(r2_overlap.iter())
.map(|(a, b)| if a != b { 1 } else { 0 })
.sum();
let max_allowed = overlap_len / 10;
if mismatches <= max_allowed {
let adapter_r1 = if r1.len() > overlap_len {
r2_rc[overlap_len..].to_vec()
} else {
Vec::new()
};
let adapter_r2 = if r2.len() > overlap_len {
reverse_complement(&r1[..r1_start])
} else {
Vec::new()
};
if !adapter_r1.is_empty() || !adapter_r2.is_empty() {
return Some((adapter_r1, adapter_r2));
}
}
}
None
}
#[inline]
pub fn reverse_complement(seq: &[u8]) -> Vec<u8> {
seq.iter()
.rev()
.map(|&b| match b {
b'A' | b'a' => b'T',
b'T' | b't' => b'A',
b'G' | b'g' => b'C',
b'C' | b'c' => b'G',
b'N' | b'n' => b'N',
_ => b'N',
})
.collect()
}
pub fn parse_adapter_fasta(path: &std::path::Path) -> std::io::Result<Vec<(String, Vec<u8>)>> {
use std::fs::File;
use std::io::{BufRead, BufReader};
let file = File::open(path)?;
let reader = BufReader::new(file);
let mut adapters = Vec::new();
let mut current_name: Option<String> = None;
let mut current_seq = Vec::new();
for line in reader.lines() {
let line = line?;
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
if trimmed.starts_with('>') {
if let Some(name) = current_name.take() {
if !current_seq.is_empty() {
adapters.push((name, current_seq.clone()));
current_seq.clear();
}
}
current_name = Some(trimmed[1..].trim().to_string());
} else {
current_seq.extend_from_slice(trimmed.as_bytes());
}
}
if let Some(name) = current_name {
if !current_seq.is_empty() {
adapters.push((name, current_seq));
}
}
Ok(adapters)
}
pub type AdapterTrimmer = AdapterConfig;
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_adapter_config_default() {
let config = AdapterConfig::default();
assert!(config.adapter_r1.is_some());
assert!(config.adapter_r2.is_some());
assert!(!config.auto_detect);
assert_eq!(config.min_overlap, 30);
}
#[test]
fn test_adapter_config_disabled() {
let config = AdapterConfig::disabled();
assert!(config.adapter_r1.is_none());
assert!(config.adapter_r2.is_none());
}
#[test]
fn test_adapter_config_truseq() {
let config = AdapterConfig::truseq();
assert_eq!(config.adapter_r1.as_deref(), Some(ILLUMINA_TRUSEQ_R1));
assert_eq!(config.adapter_r2.as_deref(), Some(ILLUMINA_TRUSEQ_R2));
}
#[test]
fn test_adapter_config_nextera() {
let config = AdapterConfig::nextera();
assert_eq!(config.adapter_r1.as_deref(), Some(NEXTERA_R1));
assert_eq!(config.adapter_r2.as_deref(), Some(NEXTERA_R2));
}
#[test]
fn test_detect_adapter_exact() {
let adapter = b"AGATCGGAAGAG";
let seq = b"ACGTACGTACGTACGTAAAAAAAGATCGGAAGAG";
let pos = detect_adapter(seq, adapter, 1);
assert!(pos.is_some());
assert!(pos.unwrap() > 0);
}
#[test]
fn test_detect_adapter_partial() {
let adapter = b"AGATCGGAAGAGCACACGT";
let seq = b"ACGTACGTACGTACGTAAAAAAAGATCGGA"; let pos = detect_adapter(seq, adapter, 1);
assert!(pos.is_some());
}
#[test]
fn test_detect_adapter_not_found() {
let adapter = b"AGATCGGAAGAG";
let seq = b"ACGTACGTACGTACGT"; let pos = detect_adapter(seq, adapter, 0);
assert!(pos.is_none());
}
#[test]
fn test_detect_adapter_empty() {
assert!(detect_adapter(&[], b"AGATC", 0).is_none());
assert!(detect_adapter(b"ACGT", &[], 0).is_none());
}
#[test]
fn test_trim_adapter() {
let adapter = b"AGATCGGAAGAG";
let seq = b"ACGTACGTACGTACGTAAAAAAAGATCGGAAGAG";
let config = AdapterConfig::new()
.with_adapter_r1(adapter.to_vec())
.with_max_mismatch(1);
let result = trim_adapter(seq, &config);
assert!(result.end < seq.len());
}
#[test]
fn test_trim_adapter_empty() {
let config = AdapterConfig::default();
let result = trim_adapter(&[], &config);
assert!(result.is_empty());
}
#[test]
fn test_trim_adapter_no_adapter_found() {
let seq = b"ACGTACGTACGTACGT";
let config = AdapterConfig::disabled();
let result = trim_adapter(seq, &config);
assert_eq!(result.start, 0);
assert_eq!(result.end, seq.len());
}
#[test]
fn test_reverse_complement() {
assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
assert_eq!(reverse_complement(b"GCGC"), b"GCGC");
assert_eq!(reverse_complement(b"ATCGATCG"), b"CGATCGAT");
assert_eq!(reverse_complement(b""), b"");
assert_eq!(reverse_complement(b"N"), b"N");
}
#[test]
fn test_reverse_complement_lowercase() {
assert_eq!(reverse_complement(b"acgt"), b"ACGT");
assert_eq!(reverse_complement(b"AcGt"), b"ACGT");
}
#[test]
fn test_detect_adapter_from_overlap_short() {
let result = detect_adapter_from_overlap(b"ACGT", b"ACGT", 30);
assert!(result.is_none());
}
#[test]
fn test_config_builder() {
let config = AdapterConfig::new()
.with_adapter_r1(b"ACGT".to_vec())
.with_adapter_r2(b"TGCA".to_vec())
.with_auto_detect(true)
.with_min_overlap(20)
.with_max_mismatch(5);
assert_eq!(config.adapter_r1.as_deref(), Some(b"ACGT".as_slice()));
assert_eq!(config.adapter_r2.as_deref(), Some(b"TGCA".as_slice()));
assert!(config.auto_detect);
assert_eq!(config.min_overlap, 20);
assert_eq!(config.max_mismatch, 5);
}
#[test]
fn test_adapter_constants() {
assert!(ILLUMINA_TRUSEQ_R1.len() > 20);
assert!(ILLUMINA_TRUSEQ_R2.len() > 20);
assert!(NEXTERA_R1.len() > 10);
assert!(NEXTERA_R2.len() > 10);
assert!(BGI_R1.len() > 20);
assert!(BGI_R2.len() > 20);
}
}