use crate::error::KmerError;
use crate::kmer::encoding::{encode_kmer, has_ambiguous_bases};
pub fn extract_kmers(sequence: &str, kmer_size: usize) -> Result<Vec<u64>, KmerError> {
if kmer_size == 0 {
return Err(KmerError::InvalidKmerSize(0));
}
let sequence_len = sequence.len();
if sequence_len < kmer_size {
return Err(KmerError::SequenceTooShort {
length: sequence_len,
min_required: kmer_size,
kmer_size,
});
}
if has_ambiguous_bases(sequence) {
return Ok(Vec::new());
}
let mut kmers = Vec::with_capacity(sequence_len - kmer_size + 1);
for i in 0..=sequence_len - kmer_size {
let kmer_str = &sequence[i..i + kmer_size];
match encode_kmer(kmer_str) {
Ok(encoded) => kmers.push(encoded),
Err(e) => {
eprintln!("Warning: Skipping k-mer at position {}: {}", i, e);
}
}
}
Ok(kmers)
}
pub fn extract_kmers_filter_ambiguous(sequence: &str, kmer_size: usize) -> Vec<u64> {
let sequence_len = sequence.len();
if sequence_len < kmer_size {
return Vec::new();
}
let mut kmers = Vec::new();
for i in 0..=sequence_len - kmer_size {
let kmer_str = &sequence[i..i + kmer_size];
if has_ambiguous_bases(kmer_str) {
continue; }
if let Ok(encoded) = encode_kmer(kmer_str) {
kmers.push(encoded);
}
}
kmers
}
pub fn filter_sequence(sequence: &str, kmer_size: usize) -> bool {
if sequence.len() < kmer_size {
return false;
}
sequence.chars().all(|ch| {
let ch = ch.to_ascii_uppercase();
matches!(ch, 'A' | 'C' | 'G' | 'T')
})
}
pub fn count_extractable_kmers(sequence: &str, kmer_size: usize) -> usize {
let sequence_len = sequence.len();
if sequence_len < kmer_size {
return 0;
}
let mut count = 0;
for i in 0..=sequence_len - kmer_size {
let kmer_str = &sequence[i..i + kmer_size];
if !has_ambiguous_bases(kmer_str) {
count += 1;
}
}
count
}
pub fn analyze_sequence_quality(sequence: &str) -> String {
let total_len = sequence.len();
let valid_bases = sequence
.chars()
.filter(|ch| matches!(ch.to_ascii_uppercase(), 'A' | 'C' | 'G' | 'T'))
.count();
let invalid_bases = total_len - valid_bases;
let ambiguous_count = sequence
.chars()
.filter(|ch| ch.eq_ignore_ascii_case(&'N'))
.count();
format!(
"Length: {} | Valid: {} | Invalid: {} | Ambiguous (N): {} | Quality: {:.1}%",
total_len,
valid_bases,
invalid_bases,
ambiguous_count,
if total_len > 0 {
(valid_bases as f64 / total_len as f64) * 100.0
} else {
0.0
}
)
}
pub fn reverse_complement_bits(kmer: u64, k: usize) -> Result<u64, KmerError> {
if k > 32 {
return Err(KmerError::InvalidKmerSize(k as u32));
}
let mut result = 0u64;
let mut input = kmer;
for _ in 0..k {
let base = input & 0b11;
let complement = base ^ 0b11;
result <<= 2;
result |= complement;
input >>= 2;
}
Ok(result)
}
pub fn reverse_complement(sequence: &str) -> String {
sequence
.chars()
.rev()
.map(|ch| match ch.to_ascii_uppercase() {
'A' => 'T',
'T' => 'A',
'G' => 'C',
'C' => 'G',
'N' => 'N', _ => 'N', })
.collect()
}
pub fn canonical_kmer(kmer_encoded: u64, k: usize) -> Result<u64, KmerError> {
let rev_comp = reverse_complement_bits(kmer_encoded, k)?;
Ok(if kmer_encoded <= rev_comp {
kmer_encoded
} else {
rev_comp
})
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_kmers_simple() {
let sequence = "ATGCGAT";
let kmers = extract_kmers(sequence, 3).unwrap();
assert_eq!(kmers.len(), 5); }
#[test]
fn test_extract_kmers_short_sequence() {
let sequence = "AT";
let result = extract_kmers(sequence, 3);
assert!(result.is_err());
}
#[test]
fn test_extract_kmers_zero_length() {
let result = extract_kmers("", 3);
assert!(result.is_err());
}
#[test]
fn test_extract_kmers_filter_ambiguous() {
let sequence = "ATGNCGAT";
let kmers = extract_kmers_filter_ambiguous(sequence, 3);
assert_eq!(kmers.len(), 3); }
#[test]
fn test_filter_sequence_valid() {
assert!(filter_sequence("ATGC", 3));
assert!(!filter_sequence("AT", 3));
assert!(!filter_sequence("ATXG", 3));
}
#[test]
fn test_filter_sequence_case_insensitive() {
assert!(filter_sequence("atgc", 3));
assert!(filter_sequence("ATGC", 3));
}
#[test]
fn test_count_extractable_kmers() {
let sequence = "ATGCGAT";
assert_eq!(count_extractable_kmers(sequence, 3), 5);
let sequence_with_amb = "ATGNCGAT"; assert_eq!(count_extractable_kmers(sequence_with_amb, 3), 3);
}
#[test]
fn test_analyze_sequence_quality() {
let quality = analyze_sequence_quality("ATGCNATG");
assert!(quality.contains("Length: 8"));
assert!(quality.contains("Ambiguous (N): 1"));
}
#[test]
fn test_extract_kmers_with_lowercase() {
let sequence = "atgc";
let kmers = extract_kmers(sequence, 3).unwrap();
assert_eq!(kmers.len(), 2);
}
#[test]
fn test_edge_case_exact_length() {
let sequence = "ATG";
let kmers = extract_kmers(sequence, 3).unwrap();
assert_eq!(kmers.len(), 1);
}
#[test]
fn test_performance_large_sequence() {
let sequence = "A".repeat(10000);
let count = count_extractable_kmers(&sequence, 31);
assert_eq!(count, 10000 - 31 + 1);
}
}