use std::collections::HashMap;
use scirs2_core::bioinformatics::alignment::{
edit_distance as core_edit_distance, needleman_wunsch as core_nw, smith_waterman as core_sw,
};
use scirs2_core::bioinformatics::phylo::{
distance_matrix as core_distance_matrix, hamming_distance as core_hamming,
jukes_cantor_distance as core_jc,
};
use scirs2_core::bioinformatics::sequence::gc_content as core_gc_content;
use scirs2_core::bioinformatics::stats::{
codon_usage_table as core_codon_usage, nucleotide_frequencies as core_nuc_freq,
};
use crate::error::{NumRs2Error, Result};
fn map_core_err(e: scirs2_core::error::CoreError) -> NumRs2Error {
NumRs2Error::ComputationError(format!("scirs2-core bioinformatics error: {e}"))
}
#[derive(Debug, Clone, PartialEq)]
pub struct AlignmentResult {
pub score: f64,
pub aligned_seq1: String,
pub aligned_seq2: String,
}
pub fn needleman_wunsch(
seq1: &str,
seq2: &str,
match_score: i32,
mismatch: i32,
gap: i32,
) -> Result<AlignmentResult> {
let (score, a1, a2) = core_nw(seq1.as_bytes(), seq2.as_bytes(), match_score, mismatch, gap)
.map_err(map_core_err)?;
Ok(AlignmentResult {
score: score as f64,
aligned_seq1: a1,
aligned_seq2: a2,
})
}
pub fn smith_waterman(
seq1: &str,
seq2: &str,
match_score: i32,
mismatch: i32,
gap: i32,
) -> Result<AlignmentResult> {
let (score, a1, a2) = core_sw(seq1.as_bytes(), seq2.as_bytes(), match_score, mismatch, gap)
.map_err(map_core_err)?;
Ok(AlignmentResult {
score: score as f64,
aligned_seq1: a1,
aligned_seq2: a2,
})
}
pub fn edit_distance(seq1: &str, seq2: &str) -> usize {
core_edit_distance(seq1.as_bytes(), seq2.as_bytes())
}
pub fn hamming_distance(seq1: &str, seq2: &str) -> Result<usize> {
core_hamming(seq1.as_bytes(), seq2.as_bytes()).map_err(map_core_err)
}
pub fn jukes_cantor_distance(p: f64) -> Result<f64> {
core_jc(p).map_err(map_core_err)
}
pub fn distance_matrix(sequences: &[&str]) -> Result<Vec<Vec<f64>>> {
let byte_seqs: Vec<&[u8]> = sequences.iter().map(|s| s.as_bytes()).collect();
let arr = core_distance_matrix(&byte_seqs).map_err(map_core_err)?;
let n = sequences.len();
let mut mat = vec![vec![0.0f64; n]; n];
for i in 0..n {
for j in 0..n {
mat[i][j] = arr[[i, j]];
}
}
Ok(mat)
}
pub fn nucleotide_frequencies(seq: &str) -> HashMap<char, f64> {
let raw = core_nuc_freq(seq.as_bytes()); let mut map = HashMap::with_capacity(4);
map.insert('A', raw[0]);
map.insert('C', raw[1]);
map.insert('G', raw[2]);
map.insert('T', raw[3]);
map
}
pub fn gc_content(seq: &str) -> f64 {
core_gc_content(seq.as_bytes())
}
pub fn codon_usage(seq: &str) -> HashMap<String, usize> {
let bytes = seq.as_bytes();
let num_codons = bytes.len() / 3;
let mut counts: HashMap<String, usize> = HashMap::new();
for i in (0..bytes.len().saturating_sub(2)).step_by(3) {
let codon = &bytes[i..i + 3];
let valid = codon
.iter()
.all(|&b| matches!(b.to_ascii_uppercase(), b'A' | b'C' | b'G' | b'T'));
if valid {
let key = String::from_utf8_lossy(codon).to_ascii_uppercase();
*counts.entry(key).or_insert(0) += 1;
}
}
let _ = num_codons;
counts
}
pub fn complement(seq: &str) -> String {
let rc = scirs2_core::bioinformatics::sequence::complement(seq.as_bytes());
String::from_utf8_lossy(&rc).into_owned()
}
pub fn reverse_complement(seq: &str) -> String {
let rc = scirs2_core::bioinformatics::sequence::reverse_complement(seq.as_bytes());
String::from_utf8_lossy(&rc).into_owned()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_edit_distance_identical() {
assert_eq!(edit_distance("ATGC", "ATGC"), 0);
}
#[test]
fn test_edit_distance_single_substitution() {
assert_eq!(edit_distance("ATGC", "ATCC"), 1);
}
#[test]
fn test_edit_distance_insertion_deletion() {
assert_eq!(edit_distance("kitten", "sitting"), 3);
}
#[test]
fn test_edit_distance_empty_sequences() {
assert_eq!(edit_distance("", ""), 0);
assert_eq!(edit_distance("ABC", ""), 3);
assert_eq!(edit_distance("", "XYZ"), 3);
}
#[test]
fn test_edit_distance_biological_sequences() {
assert_eq!(edit_distance("AGCT", "AGT"), 1);
assert_eq!(edit_distance("AAAAAA", "AAGGAA"), 2);
}
#[test]
fn test_needleman_wunsch_identical() {
let result = needleman_wunsch("AGCT", "AGCT", 1, -1, -2).expect("alignment should succeed");
assert_eq!(result.score, 4.0);
assert_eq!(result.aligned_seq1, "AGCT");
assert_eq!(result.aligned_seq2, "AGCT");
}
#[test]
fn test_needleman_wunsch_with_gap() {
let result = needleman_wunsch("AGCT", "AGT", 1, -1, -2).expect("alignment should succeed");
assert_eq!(result.aligned_seq1.len(), result.aligned_seq2.len());
assert!(result.score > 0.0);
let gaps1 = result.aligned_seq1.chars().filter(|&c| c == '-').count();
let gaps2 = result.aligned_seq2.chars().filter(|&c| c == '-').count();
assert_eq!(gaps1 + gaps2, 1, "expected exactly one gap total");
}
#[test]
fn test_needleman_wunsch_empty_sequences() {
let result = needleman_wunsch("", "", 1, -1, -2).expect("empty alignment should succeed");
assert_eq!(result.score, 0.0);
assert_eq!(result.aligned_seq1, "");
assert_eq!(result.aligned_seq2, "");
}
#[test]
fn test_needleman_wunsch_one_empty() {
let result =
needleman_wunsch("ATGC", "", 1, -1, -2).expect("one-empty alignment should succeed");
assert_eq!(result.aligned_seq1.len(), result.aligned_seq2.len());
assert!(result.aligned_seq2.chars().all(|c| c == '-'));
}
#[test]
fn test_smith_waterman_local_alignment() {
let result =
smith_waterman("TGTTACGG", "GGTTGACTA", 3, -3, -2).expect("alignment should succeed");
assert!(result.score > 0.0);
assert_eq!(result.aligned_seq1.len(), result.aligned_seq2.len());
}
#[test]
fn test_smith_waterman_no_match() {
let result = smith_waterman("AAAA", "TTTT", 1, -5, -5).expect("alignment should succeed");
assert!(result.score >= 0.0);
}
#[test]
fn test_hamming_distance_identical() {
assert_eq!(hamming_distance("ATGC", "ATGC").expect("equal lengths"), 0);
}
#[test]
fn test_hamming_distance_two_differences() {
assert_eq!(hamming_distance("ATGC", "TTGT").expect("equal lengths"), 2);
}
#[test]
fn test_hamming_distance_length_mismatch() {
assert!(
hamming_distance("ATGC", "ATG").is_err(),
"unequal lengths must return an error"
);
}
#[test]
fn test_hamming_distance_empty() {
assert_eq!(hamming_distance("", "").expect("empty sequences"), 0);
}
#[test]
fn test_jukes_cantor_zero_p() {
let d = jukes_cantor_distance(0.0).expect("p=0 is valid");
assert!((d - 0.0).abs() < 1e-10);
}
#[test]
fn test_jukes_cantor_known_value() {
let d = jukes_cantor_distance(0.1).expect("p=0.1 is valid");
let expected = -0.75 * (1.0 - (4.0 / 3.0) * 0.1_f64).ln();
assert!(
(d - expected).abs() < 1e-10,
"expected d ≈ {expected}, got {d}"
);
}
#[test]
fn test_jukes_cantor_invalid_p() {
assert!(jukes_cantor_distance(0.75).is_err(), "p=0.75 is invalid");
assert!(jukes_cantor_distance(1.0).is_err(), "p=1.0 is invalid");
assert!(jukes_cantor_distance(-0.1).is_err(), "p<0 is invalid");
}
#[test]
fn test_distance_matrix_basic() {
let seqs = &["ATGC", "ATGC", "TTGT"];
let mat = distance_matrix(seqs).expect("valid sequences");
assert_eq!(mat.len(), 3);
assert_eq!(mat[0].len(), 3);
assert!((mat[0][0] - 0.0).abs() < 1e-10);
assert!((mat[1][1] - 0.0).abs() < 1e-10);
assert!((mat[2][2] - 0.0).abs() < 1e-10);
assert!((mat[0][1] - mat[1][0]).abs() < 1e-10);
assert!((mat[0][2] - mat[2][0]).abs() < 1e-10);
assert!((mat[0][1] - 0.0).abs() < 1e-10);
}
#[test]
fn test_distance_matrix_empty_error() {
let seqs: &[&str] = &[];
assert!(distance_matrix(seqs).is_err());
}
#[test]
fn test_distance_matrix_different_lengths_error() {
let seqs = &["ATGC", "ATG"];
assert!(distance_matrix(seqs).is_err());
}
#[test]
fn test_nucleotide_frequencies_basic() {
let freqs = nucleotide_frequencies("AACGT");
assert!(
(freqs[&'A'] - 0.4).abs() < 1e-10,
"A frequency wrong: {}",
freqs[&'A']
);
assert!(
(freqs[&'C'] - 0.2).abs() < 1e-10,
"C frequency wrong: {}",
freqs[&'C']
);
assert!(
(freqs[&'G'] - 0.2).abs() < 1e-10,
"G frequency wrong: {}",
freqs[&'G']
);
assert!(
(freqs[&'T'] - 0.2).abs() < 1e-10,
"T frequency wrong: {}",
freqs[&'T']
);
}
#[test]
fn test_nucleotide_frequencies_sum_to_one() {
let freqs = nucleotide_frequencies("ATGCATGCATGC");
let sum: f64 = freqs.values().sum();
assert!((sum - 1.0).abs() < 1e-10, "frequencies must sum to 1.0");
}
#[test]
fn test_nucleotide_frequencies_empty() {
let freqs = nucleotide_frequencies("");
assert!((freqs[&'A'] - 0.0).abs() < 1e-10);
assert!((freqs[&'C'] - 0.0).abs() < 1e-10);
assert!((freqs[&'G'] - 0.0).abs() < 1e-10);
assert!((freqs[&'T'] - 0.0).abs() < 1e-10);
}
#[test]
fn test_gc_content_fifty_percent() {
assert!((gc_content("AATGCG") - 0.5).abs() < 1e-10);
}
#[test]
fn test_gc_content_zero() {
assert!((gc_content("AAAA") - 0.0).abs() < 1e-10);
}
#[test]
fn test_gc_content_one_hundred_percent() {
assert!((gc_content("GCGC") - 1.0).abs() < 1e-10);
}
#[test]
fn test_gc_content_empty() {
assert!((gc_content("") - 0.0).abs() < 1e-10);
}
#[test]
fn test_codon_usage_single_codon_repeated() {
let usage = codon_usage("ATGATGATG");
assert_eq!(*usage.get("ATG").unwrap_or(&0), 3);
}
#[test]
fn test_codon_usage_multiple_codons() {
let usage = codon_usage("ATGCTT");
assert_eq!(*usage.get("ATG").unwrap_or(&0), 1);
assert_eq!(*usage.get("CTT").unwrap_or(&0), 1);
}
#[test]
fn test_codon_usage_skips_invalid_codons() {
let usage = codon_usage("ATGNNN");
assert_eq!(*usage.get("ATG").unwrap_or(&0), 1);
assert!(!usage.contains_key("NNN"));
}
#[test]
fn test_codon_usage_empty() {
let usage = codon_usage("");
assert!(usage.is_empty());
}
#[test]
fn test_complement_basic() {
assert_eq!(complement("ATGC"), "TACG");
}
#[test]
fn test_reverse_complement_basic() {
assert_eq!(reverse_complement("ATGCTT"), "AAGCAT");
}
}