#![allow(dead_code)]
use crate::seq_utils::complement;
pub const SUBSTITUTION_TYPES: [&str; 6] = ["C>A", "C>G", "C>T", "T>A", "T>C", "T>G"];
pub const CONTEXTS: [[u8; 2]; 16] = [
[b'A', b'A'],
[b'A', b'C'],
[b'A', b'G'],
[b'A', b'T'],
[b'C', b'A'],
[b'C', b'C'],
[b'C', b'G'],
[b'C', b'T'],
[b'G', b'A'],
[b'G', b'C'],
[b'G', b'G'],
[b'G', b'T'],
[b'T', b'A'],
[b'T', b'C'],
[b'T', b'G'],
[b'T', b'T'],
];
pub fn channel_index(sub_type: usize, context_5p: u8, context_3p: u8) -> usize {
let ctx_idx = base_to_idx(context_5p) * 4 + base_to_idx(context_3p);
sub_type * 16 + ctx_idx
}
fn base_to_idx(base: u8) -> usize {
match base.to_ascii_uppercase() {
b'A' => 0,
b'C' => 1,
b'G' => 2,
b'T' => 3,
_ => 0,
}
}
pub fn sbs96_index(context_5p: u8, ref_base: u8, alt_base: u8, context_3p: u8) -> Option<usize> {
let (sub_type, ctx_5p, ctx_3p) =
normalise_to_pyrimidine(context_5p, ref_base, alt_base, context_3p)?;
Some(channel_index(sub_type, ctx_5p, ctx_3p))
}
fn normalise_to_pyrimidine(
ctx_5p: u8,
ref_base: u8,
alt_base: u8,
ctx_3p: u8,
) -> Option<(usize, u8, u8)> {
let r = ref_base.to_ascii_uppercase();
let a = alt_base.to_ascii_uppercase();
if matches!(r, b'C' | b'T') {
let sub_type = match (r, a) {
(b'C', b'A') => 0,
(b'C', b'G') => 1,
(b'C', b'T') => 2,
(b'T', b'A') => 3,
(b'T', b'C') => 4,
(b'T', b'G') => 5,
_ => return None,
};
Some((
sub_type,
ctx_5p.to_ascii_uppercase(),
ctx_3p.to_ascii_uppercase(),
))
} else {
let r_rc = complement(r);
let a_rc = complement(a);
let sub_type = match (r_rc, a_rc) {
(b'C', b'A') => 0,
(b'C', b'G') => 1,
(b'C', b'T') => 2,
(b'T', b'A') => 3,
(b'T', b'C') => 4,
(b'T', b'G') => 5,
_ => return None,
};
Some((
sub_type,
complement(ctx_3p).to_ascii_uppercase(),
complement(ctx_5p).to_ascii_uppercase(),
))
}
}
pub fn trinucleotide_context(region_seq: &[u8], pos_in_region: usize) -> Option<(u8, u8, u8)> {
if pos_in_region == 0 || pos_in_region + 1 >= region_seq.len() {
return None;
}
Some((
region_seq[pos_in_region - 1],
region_seq[pos_in_region],
region_seq[pos_in_region + 1],
))
}
pub fn variant_sbs96_index(
region_seq: &[u8],
pos_in_region: usize,
ref_base: u8,
alt_base: u8,
) -> Option<usize> {
let (ctx_5p, _central, ctx_3p) = trinucleotide_context(region_seq, pos_in_region)?;
sbs96_index(ctx_5p, ref_base, alt_base, ctx_3p)
}
pub type SignatureVec = [f64; 96];
pub fn builtin_signature(name: &str) -> Option<&'static SignatureVec> {
match name {
"SBS1" => Some(&SBS1),
"SBS2" => Some(&SBS2),
"SBS3" => Some(&SBS3),
"SBS4" => Some(&SBS4),
"SBS5" => Some(&SBS5),
"SBS7a" => Some(&SBS7A),
"SBS13" => Some(&SBS13),
"SBS17a" => Some(&SBS17A),
_ => None,
}
}
pub fn builtin_signature_names() -> &'static [&'static str] {
&[
"SBS1", "SBS2", "SBS3", "SBS4", "SBS5", "SBS7a", "SBS13", "SBS17a",
]
}
pub static SBS1: SignatureVec = {
const CG_WEIGHT: f64 = 0.10;
const REMAINDER: f64 = 1.0 - 4.0 * CG_WEIGHT;
const OTHER: f64 = REMAINDER / 92.0;
let mut v = [OTHER; 96];
v[34] = CG_WEIGHT;
v[38] = CG_WEIGHT;
v[42] = CG_WEIGHT;
v[46] = CG_WEIGHT;
v
};
pub static SBS2: SignatureVec = {
const TCA_WEIGHT: f64 = 0.40;
const TCT_WEIGHT: f64 = 0.40;
const REMAINDER: f64 = 1.0 - TCA_WEIGHT - TCT_WEIGHT;
const OTHER: f64 = REMAINDER / 94.0;
let mut v = [OTHER; 96];
v[44] = TCA_WEIGHT;
v[47] = TCT_WEIGHT;
v
};
pub static SBS13: SignatureVec = {
const TCA_WEIGHT: f64 = 0.40;
const TCT_WEIGHT: f64 = 0.40;
const REMAINDER: f64 = 1.0 - TCA_WEIGHT - TCT_WEIGHT;
const OTHER: f64 = REMAINDER / 94.0;
let mut v = [OTHER; 96];
v[28] = TCA_WEIGHT;
v[31] = TCT_WEIGHT;
v
};
pub static SBS4: SignatureVec = {
const GCN_WEIGHT: f64 = 0.15;
const REMAINDER: f64 = 1.0 - 4.0 * GCN_WEIGHT;
const OTHER: f64 = REMAINDER / 92.0;
let mut v = [OTHER; 96];
v[8] = GCN_WEIGHT; v[9] = GCN_WEIGHT; v[10] = GCN_WEIGHT; v[11] = GCN_WEIGHT; v
};
pub static SBS5: SignatureVec = [1.0 / 96.0; 96];
pub static SBS7A: SignatureVec = {
const CC_WEIGHT: f64 = 0.15;
const REMAINDER: f64 = 1.0 - 4.0 * CC_WEIGHT;
const OTHER: f64 = REMAINDER / 92.0;
let mut v = [OTHER; 96];
v[36] = CC_WEIGHT; v[37] = CC_WEIGHT; v[38] = CC_WEIGHT; v[39] = CC_WEIGHT; v
};
pub static SBS17A: SignatureVec = {
const CTT_WEIGHT: f64 = 0.30;
const OTHER_TG_WEIGHT: f64 = 0.40 / 15.0;
const BASE_WEIGHT: f64 = 0.30 / 80.0;
let mut v = [BASE_WEIGHT; 96];
let mut i = 80; while i < 96 {
v[i] = OTHER_TG_WEIGHT;
i += 1;
}
v[87] = CTT_WEIGHT;
v
};
pub static SBS3: SignatureVec = {
const CT_WEIGHT: f64 = 0.013;
const BASE_WEIGHT: f64 = (1.0 - 16.0 * CT_WEIGHT) / 80.0;
let mut v = [BASE_WEIGHT; 96];
let mut i = 32; while i < 48 {
v[i] = CT_WEIGHT;
i += 1;
}
v
};
#[cfg(test)]
mod tests {
use super::*;
const TOL: f64 = 1e-10;
fn sum_sig(sig: &SignatureVec) -> f64 {
sig.iter().sum()
}
#[test]
fn test_sbs1_sums_to_one() {
let s = sum_sig(&SBS1);
assert!((s - 1.0).abs() < TOL, "SBS1 sums to {}", s);
}
#[test]
fn test_sbs2_sums_to_one() {
let s = sum_sig(&SBS2);
assert!((s - 1.0).abs() < TOL, "SBS2 sums to {}", s);
}
#[test]
fn test_sbs3_sums_to_one() {
let s = sum_sig(&SBS3);
assert!((s - 1.0).abs() < TOL, "SBS3 sums to {}", s);
}
#[test]
fn test_sbs4_sums_to_one() {
let s = sum_sig(&SBS4);
assert!((s - 1.0).abs() < TOL, "SBS4 sums to {}", s);
}
#[test]
fn test_sbs5_sums_to_one() {
let s = sum_sig(&SBS5);
assert!((s - 1.0).abs() < TOL, "SBS5 sums to {}", s);
}
#[test]
fn test_sbs7a_sums_to_one() {
let s = sum_sig(&SBS7A);
assert!((s - 1.0).abs() < TOL, "SBS7a sums to {}", s);
}
#[test]
fn test_sbs13_sums_to_one() {
let s = sum_sig(&SBS13);
assert!((s - 1.0).abs() < TOL, "SBS13 sums to {}", s);
}
#[test]
fn test_sbs17a_sums_to_one() {
let s = sum_sig(&SBS17A);
assert!((s - 1.0).abs() < TOL, "SBS17a sums to {}", s);
}
#[test]
fn test_all_signatures_non_negative() {
let sigs: &[&SignatureVec] = &[&SBS1, &SBS2, &SBS3, &SBS4, &SBS5, &SBS7A, &SBS13, &SBS17A];
for sig in sigs {
for &v in sig.iter() {
assert!(v >= 0.0, "negative probability {}", v);
}
}
}
#[test]
fn test_channel_index_ct_tca() {
let idx = channel_index(2, b'T', b'A');
assert_eq!(idx, 44, "C>T in TCA should be channel 44");
}
#[test]
fn test_channel_index_ct_tcg() {
let idx = channel_index(2, b'T', b'G');
assert_eq!(idx, 46, "C>T in TCG should be channel 46");
}
#[test]
fn test_channel_index_tg_ctt() {
let idx = channel_index(5, b'C', b'T');
assert_eq!(idx, 87, "T>G in CTT should be channel 87");
}
#[test]
fn test_sbs96_index_pyrimidine_direct() {
let idx = sbs96_index(b'T', b'C', b'T', b'A');
assert_eq!(idx, Some(44));
}
#[test]
fn test_sbs96_index_purine_complement() {
let idx = sbs96_index(b'T', b'G', b'A', b'A');
assert_eq!(idx, Some(44));
}
#[test]
fn test_sbs96_index_invalid_substitution() {
let idx = sbs96_index(b'A', b'C', b'C', b'T');
assert_eq!(idx, None);
}
#[test]
fn test_trinucleotide_context_middle() {
let seq = b"ACGT";
assert_eq!(trinucleotide_context(seq, 1), Some((b'A', b'C', b'G')));
assert_eq!(trinucleotide_context(seq, 2), Some((b'C', b'G', b'T')));
}
#[test]
fn test_trinucleotide_context_boundary() {
let seq = b"ACGT";
assert_eq!(trinucleotide_context(seq, 0), None);
assert_eq!(trinucleotide_context(seq, 3), None);
}
#[test]
fn test_variant_sbs96_index_valid() {
let seq = b"TACG";
let idx = variant_sbs96_index(seq, 1, b'A', b'T');
assert!(idx.is_some());
}
#[test]
fn test_sbs1_cpg_channels_dominant() {
assert!((SBS1[34] - 0.10).abs() < 1e-12);
assert!((SBS1[38] - 0.10).abs() < 1e-12);
assert!((SBS1[42] - 0.10).abs() < 1e-12);
assert!((SBS1[46] - 0.10).abs() < 1e-12);
}
#[test]
fn test_sbs2_tca_tct_dominant() {
assert!((SBS2[44] - 0.40).abs() < 1e-12);
assert!((SBS2[47] - 0.40).abs() < 1e-12);
}
#[test]
fn test_sbs13_tca_tct_dominant() {
assert!((SBS13[28] - 0.40).abs() < 1e-12);
assert!((SBS13[31] - 0.40).abs() < 1e-12);
}
#[test]
fn test_sbs17a_ctt_dominant() {
assert!((SBS17A[87] - 0.30).abs() < 1e-12);
}
#[test]
fn test_builtin_signature_lookup() {
assert!(builtin_signature("SBS1").is_some());
assert!(builtin_signature("SBS5").is_some());
assert!(builtin_signature("UNKNOWN").is_none());
}
#[test]
fn test_builtin_signature_names_complete() {
let names = builtin_signature_names();
assert_eq!(names.len(), 8);
for name in names {
assert!(builtin_signature(name).is_some(), "name {} not found", name);
}
}
}