use std::collections::HashSet;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use super::{Invariant, InvariantContext, InvariantId, InvariantStatus};
use crate::models::bundle::{SynthesisBundle, SynthesisPayload};
// Hazard-class labels consumed by the database-backed screens (D1–D6).
// Every label is lowercase: `hits_in_classes` and `protein_space_rescreen`
// lowercase a hit's `hazard_class` before comparing, so matching is ASCII
// case-insensitive only as long as the labels here stay lowercase.
/// Labels that trip the select-agent screen (D1).
const SELECT_AGENT_CLASSES: &[&str] = &["select-agent", "sap"];
/// Labels that trip the pandemic-pathogen screen (D2).
const PANDEMIC_CLASSES: &[&str] = &["pandemic-pathogen", "pandemic", "pheic"];
/// Labels that trip the toxin-gene screen (D3).
const TOXIN_CLASSES: &[&str] = &["toxin", "toxin-gene"];
/// Labels reported (as an advisory) by the virulence-factor screen (D4).
const VIRULENCE_CLASSES: &[&str] = &["virulence", "virulence-factor"];
/// Labels that trip the antibiotic-resistance screen (D5).
const ABR_CLASSES: &[&str] = &["antibiotic-resistance", "card", "amr"];
/// Labels reported (as an advisory) by the synbio-part screen (D6).
const SYNBIO_CLASSES: &[&str] = &["synbio-part", "synbio", "igem", "addgene"];
/// Borrows the nucleotide string of a bundle, if it carries a DNA payload.
///
/// Returns `None` for every other payload variant, which the screens in this
/// file treat as "not applicable".
fn dna_sequence(bundle: &SynthesisBundle) -> Option<&str> {
    if let SynthesisPayload::Dna { sequence } = &bundle.payload {
        Some(sequence.as_str())
    } else {
        None
    }
}
/// Collects a `"<id> (<hazard_class>)"` label for every screening hit whose
/// hazard class, lowercased, equals one of `classes`.
///
/// The label keeps the hit's original (non-lowercased) class text. Hits are
/// reported in the order they appear in `ctx.screening_hits`; duplicates are
/// not removed.
fn hits_in_classes(ctx: &InvariantContext<'_>, classes: &[&str]) -> Vec<String> {
    let mut labels = Vec::new();
    for hit in ctx.screening_hits.iter() {
        let entry = &hit.entry;
        let normalized = entry.hazard_class.to_ascii_lowercase();
        let wanted = classes.iter().any(|class| normalized == *class);
        if wanted {
            labels.push(format!("{} ({})", entry.id, entry.hazard_class));
        }
    }
    labels
}
/// Shorthand for building an [`InvariantStatus::Fail`] from anything
/// convertible into a `String`.
fn fail(reason: impl Into<String>) -> InvariantStatus {
    let reason: String = reason.into();
    InvariantStatus::Fail { reason }
}
/// Shorthand for building an [`InvariantStatus::Advisory`] from anything
/// convertible into a `String`.
fn advisory(note: impl Into<String>) -> InvariantStatus {
    let note: String = note.into();
    InvariantStatus::Advisory { note }
}
/// k-mer length handed to the homology engine when the profile leaves
/// `protein_kmer_k` unset (see `profile_homology_engine`).
pub(crate) const DEFAULT_PROTEIN_KMER_K: usize = 5;
/// Threshold handed to the homology engine when the profile leaves
/// `protein_kmer_threshold` unset (see `profile_homology_engine`).
// NOTE(review): presumably a minimum similarity fraction in (0, 1] — confirm
// against `KmerHomologyEngine`.
pub(crate) const DEFAULT_PROTEIN_KMER_THRESHOLD: f64 = 0.30;
/// Builds the k-mer homology engine configured by the active profile.
///
/// Either parameter falls back to its `DEFAULT_PROTEIN_KMER_*` constant when
/// the profile does not set it.
fn profile_homology_engine(ctx: &InvariantContext<'_>) -> super::homology::KmerHomologyEngine {
    let profile = ctx.profile;
    let kmer_len = profile.protein_kmer_k.unwrap_or(DEFAULT_PROTEIN_KMER_K);
    let cutoff = profile
        .protein_kmer_threshold
        .unwrap_or(DEFAULT_PROTEIN_KMER_THRESHOLD);
    super::homology::KmerHomologyEngine::new(kmer_len, cutoff)
}
/// Re-screens a DNA bundle in protein space against the in-class screening hits.
///
/// The bundle sequence is translated in three forward frames and three frames
/// of its reverse complement. For every screening hit whose lowercased hazard
/// class is in `classes`, the hit's `matched_text` is translated (three frames)
/// and both frame sets are handed to the profile-configured homology engine.
/// Each engine match yields one descriptive line.
///
/// Best-effort by design: a non-DNA bundle, or a bundle sequence that fails
/// translation, produces an empty result; a hit whose `matched_text` fails
/// translation is skipped.
fn protein_space_rescreen(
    bundle: &SynthesisBundle,
    ctx: &InvariantContext<'_>,
    classes: &[&str],
) -> Vec<String> {
    use super::homology::HomologyEngine;

    let Some(seq) = dna_sequence(bundle) else {
        return Vec::new();
    };
    let engine = profile_homology_engine(ctx);

    let Ok(forward) = translate_dna_sequence(seq) else {
        return Vec::new();
    };
    // Reverse strand: complement the upper-cased bytes, then translate again.
    let rc_bytes = revcomp(seq.to_ascii_uppercase().as_bytes());
    let rc_text = String::from_utf8_lossy(&rc_bytes);
    let Ok(reverse) = translate_dna_sequence(&rc_text) else {
        return Vec::new();
    };
    let query_frames: Vec<&str> = forward.iter().chain(reverse.iter()).collect();

    let in_scope = ctx.screening_hits.iter().filter(|hit| {
        let class = hit.entry.hazard_class.to_ascii_lowercase();
        classes.iter().any(|wanted| class == *wanted)
    });

    let mut findings = Vec::new();
    for hit in in_scope {
        let Ok(reference) = translate_dna_sequence(&hit.matched_text) else {
            continue;
        };
        let reference_frames: Vec<&str> = reference.iter().collect();
        for m in engine.scan(&query_frames, &reference_frames) {
            findings.push(format!(
                "{} ({}) protein-space hit frame {} {}={:.2}",
                hit.entry.id, hit.entry.hazard_class, m.frame_index, m.method, m.similarity,
            ));
        }
    }
    findings
}
/// Runs both the class-label screen and the protein-space re-screen for
/// `classes` and merges their findings.
///
/// Class-label hits come first, exactly as `hits_in_classes` returns them.
/// Protein-space lines are appended in discovery order, skipping any line that
/// is already present — including an identical protein-space line seen earlier
/// in the same pass. Previously the dedup set was never updated, so repeated
/// protein-space lines were all appended.
fn combined_screen(
    bundle: &SynthesisBundle,
    ctx: &InvariantContext<'_>,
    classes: &[&str],
) -> Vec<String> {
    let mut all_hits = hits_in_classes(ctx, classes);
    let mut seen: HashSet<String> = all_hits.iter().cloned().collect();
    for protein_hit in protein_space_rescreen(bundle, ctx, classes) {
        // `insert` returns false when the line is already known.
        if seen.insert(protein_hit.clone()) {
            all_hits.push(protein_hit);
        }
    }
    all_hits
}
/// Protein translations of one DNA strand in its three reading frames.
///
/// `frame1` starts at nucleotide offset 0, `frame2` at offset 1 and `frame3`
/// at offset 2 (see `translate_dna_sequence`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TranslationFrames {
    pub frame1: String,
    pub frame2: String,
    pub frame3: String,
}

impl TranslationFrames {
    /// Iterates over the three frames as string slices, in frame order.
    pub fn iter(&self) -> impl Iterator<Item = &str> {
        let Self {
            frame1,
            frame2,
            frame3,
        } = self;
        std::iter::once(frame1.as_str())
            .chain(std::iter::once(frame2.as_str()))
            .chain(std::iter::once(frame3.as_str()))
    }
}
/// Errors produced while translating a nucleotide string into protein frames.
#[derive(Debug, Error, PartialEq, Eq)]
pub enum TranslateError {
/// The input contained a symbol other than A, C, G, T or N (case-insensitive).
#[error("invalid DNA base {base:?} at offset {offset}")]
InvalidBase {
/// The offending symbol as reported by the translator.
base: char,
/// Byte offset of the offending symbol within the input string.
offset: usize,
},
}
pub fn translate_dna(
bundle: &SynthesisBundle,
) -> Option<Result<TranslationFrames, TranslateError>> {
match &bundle.payload {
SynthesisPayload::Dna { sequence } => Some(translate_dna_sequence(sequence)),
_ => None,
}
}
/// Translates a nucleotide string into its three forward reading frames.
///
/// Input is case-insensitive and may contain only `A`, `C`, `G`, `T` and `N`;
/// whitespace is not tolerated. Codons containing `N` translate to `X`, stop
/// codons to `*`, and a trailing partial codon is dropped.
///
/// # Errors
///
/// Returns [`TranslateError::InvalidBase`] for the first disallowed symbol.
/// `base` is the actual character from the input and `offset` is its byte
/// offset. The input is walked by `char`, so a non-ASCII symbol is reported
/// as itself rather than as its first UTF-8 byte reinterpreted as a character.
pub fn translate_dna_sequence(dna: &str) -> Result<TranslationFrames, TranslateError> {
    let mut normalized: Vec<u8> = Vec::with_capacity(dna.len());
    for (offset, symbol) in dna.char_indices() {
        match symbol.to_ascii_uppercase() {
            // Only ASCII letters match here, so the narrowing cast is lossless.
            base @ ('A' | 'C' | 'G' | 'T' | 'N') => normalized.push(base as u8),
            _ => {
                return Err(TranslateError::InvalidBase {
                    base: symbol,
                    offset,
                })
            }
        }
    }
    Ok(TranslationFrames {
        frame1: translate_frame(&normalized, 0),
        frame2: translate_frame(&normalized, 1),
        frame3: translate_frame(&normalized, 2),
    })
}
/// Translates `dna` codon by codon starting at nucleotide `offset`.
///
/// An offset at or beyond the end of `dna` yields an empty string, and any
/// trailing one or two nucleotides that do not form a full codon are ignored.
fn translate_frame(dna: &[u8], offset: usize) -> String {
    dna.get(offset..)
        .unwrap_or(&[])
        .chunks_exact(3)
        .map(|codon| translate_codon([codon[0], codon[1], codon[2]]))
        .collect()
}
/// Maps one upper-case DNA codon to its one-letter amino-acid code.
///
/// Stop codons (`TAA`, `TAG`, `TGA`) map to `*`. Any codon containing a byte
/// other than upper-case `A`, `C`, `G` or `T` — including `N` and lower-case
/// letters — maps to `X`.
fn translate_codon(c: [u8; 3]) -> char {
    // Amino acids in T, C, A, G order for each codon position (first base is
    // the most significant digit).
    const AMINO_ACIDS: &[u8; 64] =
        b"FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";

    let mut index = 0usize;
    for base in c {
        let rank = match base {
            b'T' => 0,
            b'C' => 1,
            b'A' => 2,
            b'G' => 3,
            _ => return 'X',
        };
        index = index * 4 + rank;
    }
    AMINO_ACIDS[index] as char
}
/// D1 — fails a DNA bundle that has any screening hit in a select-agent class.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SelectAgentScreen;

impl Invariant for SelectAgentScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D1
    }

    fn name(&self) -> &'static str {
        "select_agent_screen"
    }

    /// Always passes: without a context there are no screening hits to inspect.
    fn evaluate(&self, _bundle: &SynthesisBundle) -> InvariantStatus {
        InvariantStatus::Pass
    }

    /// Passes for non-DNA payloads and for DNA with no in-class hit; otherwise
    /// fails, listing every hit found by `combined_screen`.
    fn evaluate_with(
        &self,
        bundle: &SynthesisBundle,
        ctx: &InvariantContext<'_>,
    ) -> InvariantStatus {
        match dna_sequence(bundle) {
            None => InvariantStatus::Pass,
            Some(_) => match combined_screen(bundle, ctx, SELECT_AGENT_CLASSES).as_slice() {
                [] => InvariantStatus::Pass,
                found => fail(format!("select-agent hits: {}", found.join(", "))),
            },
        }
    }
}
/// D2 — fails a DNA bundle that has any screening hit in a pandemic-pathogen class.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct PandemicPathogenScreen;

impl Invariant for PandemicPathogenScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D2
    }

    fn name(&self) -> &'static str {
        "pandemic_pathogen_screen"
    }

    /// Always passes: without a context there are no screening hits to inspect.
    fn evaluate(&self, _bundle: &SynthesisBundle) -> InvariantStatus {
        InvariantStatus::Pass
    }

    /// Passes for non-DNA payloads and for DNA with no in-class hit; otherwise
    /// fails, listing every hit found by `combined_screen`.
    fn evaluate_with(
        &self,
        bundle: &SynthesisBundle,
        ctx: &InvariantContext<'_>,
    ) -> InvariantStatus {
        match dna_sequence(bundle) {
            None => InvariantStatus::Pass,
            Some(_) => match combined_screen(bundle, ctx, PANDEMIC_CLASSES).as_slice() {
                [] => InvariantStatus::Pass,
                found => fail(format!("pandemic-pathogen hits: {}", found.join(", "))),
            },
        }
    }
}
/// D3 — fails a DNA bundle that has any screening hit in a toxin class.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ToxinGeneScreen;

impl Invariant for ToxinGeneScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D3
    }

    fn name(&self) -> &'static str {
        "toxin_gene_screen"
    }

    /// Always passes: without a context there are no screening hits to inspect.
    fn evaluate(&self, _bundle: &SynthesisBundle) -> InvariantStatus {
        InvariantStatus::Pass
    }

    /// Passes for non-DNA payloads and for DNA with no in-class hit; otherwise
    /// fails, listing every hit found by `combined_screen`.
    fn evaluate_with(
        &self,
        bundle: &SynthesisBundle,
        ctx: &InvariantContext<'_>,
    ) -> InvariantStatus {
        match dna_sequence(bundle) {
            None => InvariantStatus::Pass,
            Some(_) => match combined_screen(bundle, ctx, TOXIN_CLASSES).as_slice() {
                [] => InvariantStatus::Pass,
                found => fail(format!("toxin hits: {}", found.join(", "))),
            },
        }
    }
}
/// D4 — raises an advisory for a DNA bundle with any virulence-factor hit.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct VirulenceFactorScreen;

impl Invariant for VirulenceFactorScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D4
    }

    fn name(&self) -> &'static str {
        "virulence_factor_screen"
    }

    /// Always passes: without a context there are no screening hits to inspect.
    fn evaluate(&self, _bundle: &SynthesisBundle) -> InvariantStatus {
        InvariantStatus::Pass
    }

    /// Passes for non-DNA payloads and for DNA with no in-class hit; otherwise
    /// returns an advisory (not a failure) listing every hit.
    fn evaluate_with(
        &self,
        bundle: &SynthesisBundle,
        ctx: &InvariantContext<'_>,
    ) -> InvariantStatus {
        match dna_sequence(bundle) {
            None => InvariantStatus::Pass,
            Some(_) => match combined_screen(bundle, ctx, VIRULENCE_CLASSES).as_slice() {
                [] => InvariantStatus::Pass,
                found => advisory(format!("virulence-factor hits: {}", found.join(", "))),
            },
        }
    }
}
/// D5 — fails a DNA bundle that has any antibiotic-resistance screening hit.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct AntibioticResistanceScreen;

impl Invariant for AntibioticResistanceScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D5
    }

    fn name(&self) -> &'static str {
        "antibiotic_resistance_screen"
    }

    /// Always passes: without a context there are no screening hits to inspect.
    fn evaluate(&self, _bundle: &SynthesisBundle) -> InvariantStatus {
        InvariantStatus::Pass
    }

    /// Passes for non-DNA payloads and for DNA with no in-class hit; otherwise
    /// fails, listing every hit found by `combined_screen`.
    fn evaluate_with(
        &self,
        bundle: &SynthesisBundle,
        ctx: &InvariantContext<'_>,
    ) -> InvariantStatus {
        match dna_sequence(bundle) {
            None => InvariantStatus::Pass,
            Some(_) => match combined_screen(bundle, ctx, ABR_CLASSES).as_slice() {
                [] => InvariantStatus::Pass,
                found => fail(format!("antibiotic-resistance hits: {}", found.join(", "))),
            },
        }
    }
}
/// D6 — raises an advisory for a DNA bundle that matches a synbio-part class.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SynbioPartScreen;

impl Invariant for SynbioPartScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D6
    }

    fn name(&self) -> &'static str {
        "synbio_part_screen"
    }

    /// Always passes: without a context there are no screening hits to inspect.
    fn evaluate(&self, _bundle: &SynthesisBundle) -> InvariantStatus {
        InvariantStatus::Pass
    }

    /// Passes for non-DNA payloads and for DNA with no in-class hit; otherwise
    /// returns an advisory (not a failure) listing every hit.
    fn evaluate_with(
        &self,
        bundle: &SynthesisBundle,
        ctx: &InvariantContext<'_>,
    ) -> InvariantStatus {
        match dna_sequence(bundle) {
            None => InvariantStatus::Pass,
            Some(_) => match combined_screen(bundle, ctx, SYNBIO_CLASSES).as_slice() {
                [] => InvariantStatus::Pass,
                found => advisory(format!("synbio-part hits: {}", found.join(", "))),
            },
        }
    }
}
/// D7 — advisory screen on codon-usage statistics of a DNA bundle.
///
/// Codons are read in frame 0 only. A codon is counted only when all three
/// bases are canonical (`A`/`C`/`G`/`T`, case-insensitive). Sequences with
/// fewer than ten countable codons always pass.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CodonEntropyScreen;

impl CodonEntropyScreen {
    /// Fewest countable codons required before any statistic is evaluated.
    const MIN_CODONS: u32 = 10;
    /// Entropy band (bits) used when the profile gives neither a band nor an organism.
    const DEFAULT_BAND: (f64, f64) = (2.5, 5.8);

    /// Tallies frame-0 codons made solely of canonical bases.
    ///
    /// Returns the per-codon counts and the total number of counted codons.
    fn codon_counts(seq: &str) -> (std::collections::HashMap<[u8; 3], u32>, u32) {
        let upper = seq.to_ascii_uppercase();
        let mut counts: std::collections::HashMap<[u8; 3], u32> =
            std::collections::HashMap::new();
        let mut total = 0u32;
        // `chunks_exact` drops a trailing partial codon.
        for codon in upper.as_bytes().chunks_exact(3) {
            if codon
                .iter()
                .all(|b| matches!(b, b'A' | b'C' | b'G' | b'T'))
            {
                *counts.entry([codon[0], codon[1], codon[2]]).or_default() += 1;
                total += 1;
            }
        }
        (counts, total)
    }

    /// Shannon entropy (base 2) of the codon distribution.
    ///
    /// Terms are accumulated in sorted-count order so the floating-point sum
    /// does not depend on `HashMap` iteration order.
    fn shannon_entropy(counts: &std::collections::HashMap<[u8; 3], u32>, total: u32) -> f64 {
        let mut tallies: Vec<u32> = counts.values().copied().collect();
        tallies.sort_unstable();
        let n = total as f64;
        let mut entropy = 0.0f64;
        for tally in tallies {
            let p = tally as f64 / n;
            entropy -= p * p.log2();
        }
        entropy
    }

    /// Returns the out-of-band advisory when `entropy` lies outside `[lo, hi]`.
    fn band_advisory(entropy: f64, (lo, hi): (f64, f64)) -> Option<InvariantStatus> {
        if (lo..=hi).contains(&entropy) {
            None
        } else {
            Some(advisory(format!(
                "codon entropy {:.2} outside expected band [{:.1}, {:.1}]",
                entropy, lo, hi
            )))
        }
    }
}

impl Invariant for CodonEntropyScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D7
    }

    fn name(&self) -> &'static str {
        "codon_entropy_screen"
    }

    /// Context-free check: compares codon entropy against the default band
    /// `[2.5, 5.8]` and returns an advisory when it falls outside.
    fn evaluate(&self, bundle: &SynthesisBundle) -> InvariantStatus {
        let Some(seq) = dna_sequence(bundle) else {
            return InvariantStatus::Pass;
        };
        let (counts, total) = Self::codon_counts(seq);
        if total < Self::MIN_CODONS {
            return InvariantStatus::Pass;
        }
        let entropy = Self::shannon_entropy(&counts, total);
        Self::band_advisory(entropy, Self::DEFAULT_BAND).unwrap_or(InvariantStatus::Pass)
    }

    /// Profile-aware check.
    ///
    /// The entropy band is taken from `codon_entropy_band` if set, else from
    /// the organism lookup, else the default. When the entropy is in band and
    /// the profile names an organism with a known usage table, a chi-squared
    /// goodness-of-fit test is run as well; a p-value below
    /// `CHI2_P_VALUE_THRESHOLD` yields an advisory.
    fn evaluate_with(
        &self,
        bundle: &SynthesisBundle,
        ctx: &InvariantContext<'_>,
    ) -> InvariantStatus {
        let Some(seq) = dna_sequence(bundle) else {
            return InvariantStatus::Pass;
        };
        let (counts, total) = Self::codon_counts(seq);
        if total < Self::MIN_CODONS {
            return InvariantStatus::Pass;
        }
        let entropy = Self::shannon_entropy(&counts, total);

        let band = if let Some(explicit) = ctx.profile.codon_entropy_band {
            explicit
        } else if let Some(ref org) = ctx.profile.codon_usage_organism {
            organism_entropy_band(org)
        } else {
            Self::DEFAULT_BAND
        };
        if let Some(status) = Self::band_advisory(entropy, band) {
            return status;
        }

        if let Some(ref org) = ctx.profile.codon_usage_organism {
            if let Some(table) = cutg_table(org) {
                let (chi2, df) = codon_chi_squared(&counts, total, table);
                if df > 0 {
                    let p_value = chi2_survival_approx(chi2, df);
                    if p_value < CHI2_P_VALUE_THRESHOLD {
                        return advisory(format!(
                            "codon usage deviates from {} CUTG (χ²={:.1}, df={}, p={:.2e})",
                            org, chi2, df, p_value
                        ));
                    }
                }
            }
        }
        InvariantStatus::Pass
    }
}
/// Looks up the expected codon-entropy band `(low, high)` for an organism key.
///
/// Keys are matched exactly; an unrecognised key gets the generic band
/// `(2.5, 5.8)`.
fn organism_entropy_band(organism: &str) -> (f64, f64) {
    const FALLBACK: (f64, f64) = (2.5, 5.8);
    const BANDS: [(&str, (f64, f64)); 4] = [
        ("e_coli", (3.0, 5.5)),
        ("s_cerevisiae", (2.8, 5.6)),
        ("h_sapiens", (3.2, 5.8)),
        ("cho_k1", (3.0, 5.7)),
    ];
    BANDS
        .iter()
        .find(|entry| entry.0 == organism)
        .map_or(FALLBACK, |entry| entry.1)
}
/// Expected frequency for each of the 64 codons, indexed by `codon_to_index`.
type CutgTable = [f64; 64];
/// Encodes a codon as a base-4 number in `0..64`, first base most significant.
///
/// Digits are `A = 0`, `C = 1`, `G = 2`, `T = 3`. Any other byte is treated
/// as digit 0, i.e. it is indistinguishable from `A`.
fn codon_to_index(codon: &[u8; 3]) -> usize {
    codon.iter().fold(0usize, |acc, &base| {
        let digit = match base {
            b'A' => 0,
            b'C' => 1,
            b'G' => 2,
            b'T' => 3,
            _ => 0,
        };
        acc * 4 + digit
    })
}
// Per-organism expected codon frequencies consumed by `codon_chi_squared`.
// Entry `i` belongs to the codon whose `codon_to_index` value is `i`
// (AAA, AAC, AAG, AAT, ACA, ... TTT).
// NOTE(review): the values are presumably rounded CUTG (Codon Usage Tabulated
// from GenBank) fractions — provenance is not visible here; confirm the source
// before editing.
/// Expected codon frequencies selected by the `"e_coli"` organism key.
const CUTG_E_COLI: CutgTable = [
0.034, 0.022, 0.011, 0.018, 0.007, 0.023, 0.014, 0.009, 0.002, 0.016, 0.001, 0.009, 0.004, 0.025, 0.028, 0.024, 0.015, 0.010, 0.029, 0.012, 0.008, 0.005, 0.023, 0.007, 0.003, 0.022, 0.005, 0.021, 0.004, 0.011, 0.052, 0.011, 0.040, 0.019, 0.018, 0.032, 0.020, 0.026, 0.033, 0.015, 0.008, 0.029, 0.011, 0.025, 0.011, 0.015, 0.026, 0.018, 0.002, 0.012, 0.001, 0.016, 0.007, 0.009, 0.009, 0.015, 0.001, 0.006, 0.015, 0.005, 0.013, 0.016, 0.013, 0.022, ];
/// Expected codon frequencies selected by the `"s_cerevisiae"` organism key.
const CUTG_S_CEREVISIAE: CutgTable = [
0.042, 0.025, 0.031, 0.036, 0.018, 0.013, 0.008, 0.020, 0.021, 0.010, 0.009, 0.014, 0.018, 0.017, 0.021, 0.030, 0.027, 0.008, 0.012, 0.014, 0.018, 0.007, 0.005, 0.014, 0.003, 0.003, 0.002, 0.006, 0.014, 0.006, 0.011, 0.013, 0.046, 0.020, 0.019, 0.038, 0.016, 0.013, 0.006, 0.021, 0.011, 0.010, 0.006, 0.024, 0.012, 0.012, 0.011, 0.022, 0.001, 0.015, 0.001, 0.019, 0.019, 0.015, 0.004, 0.024, 0.001, 0.005, 0.010, 0.008, 0.026, 0.018, 0.027, 0.026, ];
/// Expected codon frequencies selected by the `"h_sapiens"` organism key.
const CUTG_H_SAPIENS: CutgTable = [
0.024, 0.019, 0.032, 0.017, 0.015, 0.019, 0.006, 0.013, 0.012, 0.020, 0.012, 0.012, 0.007, 0.021, 0.022, 0.016, 0.012, 0.015, 0.034, 0.011, 0.017, 0.020, 0.007, 0.017, 0.006, 0.011, 0.012, 0.005, 0.007, 0.020, 0.040, 0.013, 0.029, 0.025, 0.040, 0.022, 0.016, 0.028, 0.007, 0.018, 0.017, 0.022, 0.016, 0.011, 0.007, 0.015, 0.028, 0.011, 0.001, 0.015, 0.001, 0.012, 0.012, 0.018, 0.004, 0.015, 0.001, 0.013, 0.013, 0.010, 0.008, 0.020, 0.013, 0.017, ];
/// Expected codon frequencies selected by the `"cho_k1"` organism key.
const CUTG_CHO_K1: CutgTable = [
0.025, 0.019, 0.031, 0.017, 0.015, 0.019, 0.006, 0.013, 0.012, 0.019, 0.012, 0.012, 0.008, 0.020, 0.022, 0.016, 0.013, 0.015, 0.034, 0.011, 0.017, 0.019, 0.007, 0.017, 0.006, 0.010, 0.011, 0.005, 0.007, 0.019, 0.039, 0.013, 0.029, 0.025, 0.039, 0.023, 0.016, 0.028, 0.007, 0.019, 0.016, 0.022, 0.016, 0.011, 0.007, 0.015, 0.028, 0.011, 0.001, 0.015, 0.001, 0.012, 0.012, 0.017, 0.004, 0.015, 0.001, 0.013, 0.013, 0.010, 0.008, 0.020, 0.013, 0.017, ];
/// Returns the expected codon-frequency table for an organism key.
///
/// Keys are matched exactly; `None` is returned for any key without a table.
fn cutg_table(organism: &str) -> Option<&'static CutgTable> {
    let known: [(&str, &'static CutgTable); 4] = [
        ("e_coli", &CUTG_E_COLI),
        ("s_cerevisiae", &CUTG_S_CEREVISIAE),
        ("h_sapiens", &CUTG_H_SAPIENS),
        ("cho_k1", &CUTG_CHO_K1),
    ];
    known
        .iter()
        .find(|entry| entry.0 == organism)
        .map(|entry| entry.1)
}
/// Pearson chi-squared statistic of observed codon counts against `table`.
///
/// Every codon with a positive expected frequency is one category. Observed
/// codons contribute `(O - E)^2 / E`; codons expected but never observed
/// contribute `E`, which is that same term with `O = 0`. Codons whose
/// expected frequency is not positive are ignored.
///
/// Returns `(statistic, degrees_of_freedom)`, where the degrees of freedom
/// are the category count minus one, saturating at zero.
fn codon_chi_squared(
    counts: &std::collections::HashMap<[u8; 3], u32>,
    total: u32,
    table: &CutgTable,
) -> (f64, usize) {
    const BASES: [u8; 4] = *b"ACGT";

    let sample_size = total as f64;
    let mut statistic = 0.0;
    let mut categories = 0usize;

    // Pass 1: codons that were actually observed.
    for (codon, &seen) in counts.iter() {
        let freq = table[codon_to_index(codon)];
        if freq <= 0.0 {
            continue;
        }
        let expected = freq * sample_size;
        let deviation = seen as f64 - expected;
        statistic += (deviation * deviation) / expected;
        categories += 1;
    }

    // Pass 2: codons the table expects but the sample never contained.
    for (idx, &freq) in table.iter().enumerate() {
        if freq <= 0.0 {
            continue;
        }
        // Invert `codon_to_index`: two bits per base, first base highest.
        let codon = [
            BASES[(idx >> 4) & 3],
            BASES[(idx >> 2) & 3],
            BASES[idx & 3],
        ];
        if counts.contains_key(&codon) {
            continue;
        }
        statistic += freq * sample_size;
        categories += 1;
    }

    (statistic, categories.saturating_sub(1))
}
/// Approximates the chi-squared upper-tail probability `P(X >= x)` for `df`
/// degrees of freedom.
///
/// The statistic is mapped to a z-score through a cube-root transform and
/// passed to `normal_survival`. Returns `1.0` when `df` is zero or `x` is not
/// positive; the result is clamped to `[0, 1]`.
fn chi2_survival_approx(x: f64, df: usize) -> f64 {
    if df == 0 || x <= 0.0 {
        return 1.0;
    }
    let k = df as f64;
    let spread = 2.0 / (9.0 * k);
    let cube_root = (x / k).powf(1.0 / 3.0);
    let z = (cube_root - (1.0 - spread)) / spread.sqrt();
    normal_survival(z).clamp(0.0, 1.0)
}
/// Approximates the standard-normal upper-tail probability `P(Z >= z)`.
///
/// Uses a five-term polynomial in `t = 1 / (1 + 0.2316419 * |z|)` multiplied
/// by the normal density. Beyond `|z| > 8` the result saturates to exactly
/// `0.0` (positive side) or `1.0` (negative side).
fn normal_survival(z: f64) -> f64 {
    // 1 / sqrt(2 * pi)
    const INV_SQRT_TAU: f64 = 0.3989422804014327;
    // Polynomial coefficients, highest power first, for Horner evaluation.
    const COEFFS: [f64; 5] = [
        1.330274429,
        -1.821255978,
        1.781477937,
        -0.356563782,
        0.319381530,
    ];

    if z < -8.0 {
        return 1.0;
    }
    if z > 8.0 {
        return 0.0;
    }

    let magnitude = z.abs();
    let t = 1.0 / (1.0 + 0.2316419 * magnitude);
    let horner = COEFFS.iter().fold(0.0, |acc, &coeff| coeff + t * acc);
    let upper_tail = INV_SQRT_TAU * (-magnitude * magnitude / 2.0).exp() * (t * horner);

    // The polynomial gives the tail for |z|; mirror it for non-positive z.
    if z > 0.0 {
        upper_tail
    } else {
        1.0 - upper_tail
    }
}
/// Chi-squared p-value below which `CodonEntropyScreen::evaluate_with` reports
/// a codon-usage deviation advisory.
const CHI2_P_VALUE_THRESHOLD: f64 = 1e-4;
/// D8 — fails a DNA bundle containing a window whose GC fraction lies outside
/// `[0.25, 0.75]`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct GcContentScreen;

impl Invariant for GcContentScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D8
    }

    fn name(&self) -> &'static str {
        "gc_content_screen"
    }

    /// Scans the sequence in windows of up to 100 bases.
    ///
    /// Windows are non-overlapping and start at offset 0. When the length is
    /// not a multiple of the window size, one extra window anchored at the end
    /// of the sequence is checked too, so trailing bases are no longer skipped.
    /// Sequences shorter than 100 bases are checked as a single window.
    ///
    /// The GC fraction is computed over canonical bases only (`A`/`C`/`G`/`T`,
    /// case-insensitive); a window with no canonical base is skipped. The
    /// first offending window is reported. Non-DNA payloads and empty
    /// sequences pass.
    fn evaluate(&self, bundle: &SynthesisBundle) -> InvariantStatus {
        const WINDOW: usize = 100;

        let Some(seq) = dna_sequence(bundle) else {
            return InvariantStatus::Pass;
        };
        let upper = seq.to_ascii_uppercase();
        let bytes = upper.as_bytes();
        if bytes.is_empty() {
            return InvariantStatus::Pass;
        }

        let win = WINDOW.min(bytes.len());
        let last_start = bytes.len() - win;
        // Only needed when the regular grid leaves a remainder uncovered.
        let tail_start = (last_start % win != 0).then_some(last_start);

        for start in (0..=last_start).step_by(win).chain(tail_start) {
            let window = &bytes[start..start + win];
            let canonical = window
                .iter()
                .filter(|&&b| matches!(b, b'A' | b'C' | b'G' | b'T'))
                .count();
            if canonical == 0 {
                continue;
            }
            let gc = window
                .iter()
                .filter(|&&b| matches!(b, b'G' | b'C'))
                .count();
            let frac = gc as f64 / canonical as f64;
            if !(0.25..=0.75).contains(&frac) {
                return fail(format!(
                    "GC content {:.2} at offset {} outside synthesizable [0.25, 0.75] window",
                    frac, start
                ));
            }
        }
        InvariantStatus::Pass
    }
}
/// D9 — fails a DNA bundle that contains a hairpin candidate: a 20-base
/// window followed, at least 4 bases later, by its exact reverse complement.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SecondaryStructureScreen;

impl Invariant for SecondaryStructureScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D9
    }

    fn name(&self) -> &'static str {
        "secondary_structure_screen"
    }

    /// Slides a 20-base window over the upper-cased sequence, remembering the
    /// reverse complement of every fully canonical window seen so far.
    ///
    /// A window that equals a remembered reverse complement closes a stem; it
    /// is reported when the gap between the two arms is at least 4 bases.
    /// Only the earliest offset of each reverse complement is kept. Keeping
    /// the latest one, as before, could hide a valid stem when a repeated arm
    /// sat closer than the minimum loop, and the earliest offset always gives
    /// the widest gap.
    ///
    /// Non-DNA payloads and sequences shorter than two arms plus the minimum
    /// loop pass.
    fn evaluate(&self, bundle: &SynthesisBundle) -> InvariantStatus {
        const W: usize = 20;
        const MIN_LOOP: usize = 4;

        let Some(seq) = dna_sequence(bundle) else {
            return InvariantStatus::Pass;
        };
        let upper = seq.to_ascii_uppercase();
        let bytes = upper.as_bytes();
        if bytes.len() < 2 * W + MIN_LOOP {
            return InvariantStatus::Pass;
        }

        let mut rc_index: std::collections::HashMap<Vec<u8>, usize> =
            std::collections::HashMap::new();
        for (i, win) in bytes.windows(W).enumerate() {
            let canonical = win
                .iter()
                .all(|b| matches!(b, b'A' | b'C' | b'G' | b'T'));
            if !canonical {
                continue;
            }
            if let Some(&j) = rc_index.get(win) {
                // Overlapping or adjacent arms saturate to a zero gap.
                if i.saturating_sub(j + W) >= MIN_LOOP {
                    return fail(format!(
                        "hairpin candidate: window at offset {j} reverse-complements window at offset {i}"
                    ));
                }
            }
            rc_index.entry(revcomp(win)).or_insert(i);
        }
        InvariantStatus::Pass
    }
}
/// Returns the reverse complement of an upper-case nucleotide byte string.
///
/// `A`↔`T` and `C`↔`G` are swapped; every other byte (including `N` and
/// lower-case letters) is copied unchanged, only its position is reversed.
fn revcomp(seq: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(seq.len());
    for &base in seq.iter().rev() {
        let paired = match base {
            b'A' => b'T',
            b'T' => b'A',
            b'C' => b'G',
            b'G' => b'C',
            other => other,
        };
        out.push(paired);
    }
    out
}
/// D10 — raises an advisory when a DNA fragment begins or ends with one of the
/// probed restriction-enzyme recognition sites.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct AssemblyCompatibilityScreen;

impl Invariant for AssemblyCompatibilityScreen {
    fn id(&self) -> InvariantId {
        InvariantId::D10
    }

    fn name(&self) -> &'static str {
        "assembly_compatibility_screen"
    }

    /// Checks both termini of the upper-cased sequence against each probe site
    /// and returns an advisory for the first match, in probe order.
    ///
    /// Only the site strings as written are compared; their reverse
    /// complements are not probed. Non-DNA payloads pass.
    ///
    /// The comparison uses `starts_with`/`ends_with` rather than byte-index
    /// slicing, so a sequence containing multi-byte characters cannot trigger
    /// a char-boundary panic.
    fn evaluate(&self, bundle: &SynthesisBundle) -> InvariantStatus {
        const PROBES: [(&str, &str); 3] =
            [("BsaI", "GGTCTC"), ("BbsI", "GAAGAC"), ("SapI", "GCTCTTC")];

        let Some(seq) = dna_sequence(bundle) else {
            return InvariantStatus::Pass;
        };
        let upper = seq.to_ascii_uppercase();
        for (name, site) in PROBES {
            if upper.starts_with(site) || upper.ends_with(site) {
                return advisory(format!(
                    "fragment terminus exposes {name} site {site}; possible Golden Gate input"
                ));
            }
        }
        InvariantStatus::Pass
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::models::bundle::{BundleAuthority, SynthesisPayload};
use chrono::Utc;
fn dna(seq: &str) -> SynthesisBundle {
SynthesisBundle {
timestamp: Utc::now(),
source: "t".into(),
sequence: 0,
payload: SynthesisPayload::Dna {
sequence: seq.into(),
},
delta_time: 0.0,
authority: BundleAuthority {
pca_chain: String::new(),
required_ops: vec![],
},
metadata: Default::default(),
}
}
use crate::models::profile::BioProfile;
use crate::screening::{HazardEntry, HazardHit};
fn profile() -> BioProfile {
BioProfile {
name: "t".into(),
version: "0.1.0".into(),
bsl_level: 2,
allowed_substrates: vec!["dna".into()],
max_synthesis_volume_ml: 1.0,
export_controlled: false,
profile_signature: None,
profile_signer_kid: None,
codon_usage_organism: None,
codon_entropy_band: None,
protein_kmer_k: None,
protein_kmer_threshold: None,
allowed_protocol_steps: None,
allow_stale_screening: false,
stale_screening_max_days: None,
max_authority_chain_depth: 5,
max_dna_length_bp: None,
max_peptide_length_aa: None,
max_smiles_length_chars: None,
}
}
fn ctx<'a>(hits: &'a [HazardHit], prof: &'a BioProfile) -> InvariantContext<'a> {
InvariantContext {
screening_hits: hits,
profile: prof,
}
}
fn hit(class: &str, id: &str) -> HazardHit {
HazardHit {
entry: HazardEntry {
id: id.into(),
label: id.into(),
hazard_class: class.into(),
pattern: ".*".into(),
},
matched_text: "MATCH".into(),
}
}
#[test]
fn d1_pass_when_no_select_agent_hit() {
let p = profile();
let s = SelectAgentScreen;
assert!(matches!(
s.evaluate_with(&dna("ATGAAA"), &ctx(&[], &p)),
InvariantStatus::Pass
));
}
#[test]
fn d1_fail_when_select_agent_hit() {
let p = profile();
let h = vec![hit("select-agent", "sap-1")];
let s = SelectAgentScreen;
assert!(matches!(
s.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Fail { .. }
));
}
#[test]
fn d1_unrelated_hits_ignored() {
let p = profile();
let h = vec![hit("antimicrobial", "amp-1")];
let s = SelectAgentScreen;
assert!(matches!(
s.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Pass
));
}
#[test]
fn d2_pass_when_no_pandemic_hit() {
let p = profile();
let s = PandemicPathogenScreen;
assert!(matches!(
s.evaluate_with(&dna("ATGAAA"), &ctx(&[], &p)),
InvariantStatus::Pass
));
}
#[test]
fn d2_fail_on_pandemic_hit() {
let p = profile();
let h = vec![hit("pandemic-pathogen", "pheic-1")];
let s = PandemicPathogenScreen;
assert!(matches!(
s.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Fail { .. }
));
}
#[test]
fn d2_passes_for_non_dna_payload() {
let p = profile();
let bundle = SynthesisBundle {
timestamp: Utc::now(),
source: "t".into(),
sequence: 0,
payload: SynthesisPayload::Peptide {
sequence: "AAAA".into(),
},
delta_time: 0.0,
authority: BundleAuthority {
pca_chain: "".into(),
required_ops: vec![],
},
metadata: Default::default(),
};
assert!(matches!(
PandemicPathogenScreen.evaluate_with(&bundle, &ctx(&[], &p)),
InvariantStatus::Pass
));
}
#[test]
fn d3_pass_clean() {
let p = profile();
assert!(matches!(
ToxinGeneScreen.evaluate_with(&dna("ATGAAA"), &ctx(&[], &p)),
InvariantStatus::Pass
));
}
#[test]
fn d3_fail_on_toxin_hit() {
let p = profile();
let h = vec![hit("toxin", "tox-1")];
assert!(matches!(
ToxinGeneScreen.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Fail { .. }
));
}
#[test]
fn d3_unrelated_hits_ignored() {
let p = profile();
let h = vec![hit("synbio-part", "ig-1")];
assert!(matches!(
ToxinGeneScreen.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Pass
));
}
#[test]
fn d4_pass_clean() {
let p = profile();
assert!(matches!(
VirulenceFactorScreen.evaluate_with(&dna("ATGAAA"), &ctx(&[], &p)),
InvariantStatus::Pass
));
}
#[test]
fn d4_advisory_on_virulence_hit() {
let p = profile();
let h = vec![hit("virulence-factor", "vf-1")];
assert!(matches!(
VirulenceFactorScreen.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Advisory { .. }
));
}
#[test]
fn d4_unrelated_ignored() {
let p = profile();
let h = vec![hit("toxin", "tox-1")];
assert!(matches!(
VirulenceFactorScreen.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Pass
));
}
#[test]
fn d5_pass_clean() {
let p = profile();
assert!(matches!(
AntibioticResistanceScreen.evaluate_with(&dna("ATGAAA"), &ctx(&[], &p)),
InvariantStatus::Pass
));
}
#[test]
fn d5_fail_on_amr_hit() {
let p = profile();
let h = vec![hit("antibiotic-resistance", "card-1")];
assert!(matches!(
AntibioticResistanceScreen.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Fail { .. }
));
}
#[test]
fn d5_amr_alias_card_class() {
let p = profile();
let h = vec![hit("CARD", "x")];
assert!(matches!(
AntibioticResistanceScreen.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Fail { .. }
));
}
#[test]
fn d6_pass_clean() {
let p = profile();
assert!(matches!(
SynbioPartScreen.evaluate_with(&dna("ATGAAA"), &ctx(&[], &p)),
InvariantStatus::Pass
));
}
#[test]
fn d6_advisory_on_part_hit() {
let p = profile();
let h = vec![hit("synbio-part", "BBa_001")];
assert!(matches!(
SynbioPartScreen.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Advisory { .. }
));
}
#[test]
fn d6_unrelated_ignored() {
let p = profile();
let h = vec![hit("toxin", "tox-1")];
assert!(matches!(
SynbioPartScreen.evaluate_with(&dna("ATGAAA"), &ctx(&h, &p)),
InvariantStatus::Pass
));
}
#[test]
fn d7_short_sequence_passes() {
assert!(matches!(
CodonEntropyScreen.evaluate(&dna("ATGAAATTT")),
InvariantStatus::Pass
));
}
#[test]
fn d7_diverse_codons_pass() {
let s = "ATGAAACCCGGGTTTCATCAGAATGAAGAAACTGGTGGT";
assert!(matches!(
CodonEntropyScreen.evaluate(&dna(s)),
InvariantStatus::Pass
));
}
#[test]
fn d7_homopolymer_codons_advisory() {
let s = "AAA".repeat(12);
assert!(matches!(
CodonEntropyScreen.evaluate(&dna(&s)),
InvariantStatus::Advisory { .. }
));
}
#[test]
fn d7_explicit_band_overrides_default() {
let mut p = profile();
p.codon_entropy_band = Some((4.0, 5.0));
let seq = "ATGAAACCCGGGTTTCATCAGAATGAAGAAACTGGTGGT";
let c = ctx(&[], &p);
let status = CodonEntropyScreen.evaluate_with(&dna(seq), &c);
assert!(matches!(status, InvariantStatus::Advisory { .. }));
}
#[test]
fn d7_organism_lookup_used_when_no_explicit_band() {
let mut p = profile();
p.codon_usage_organism = Some("h_sapiens".into());
let seq = "ATGAAACCCGGGTTTCATCAGAATGAAGAAACTGGTGGT";
let c = ctx(&[], &p);
let status = CodonEntropyScreen.evaluate_with(&dna(seq), &c);
assert!(matches!(status, InvariantStatus::Pass));
}
#[test]
fn d7_default_fallback_when_no_profile_override() {
let p = profile();
let seq = "ATGAAACCCGGGTTTCATCAGAATGAAGAAACTGGTGGT";
let c = ctx(&[], &p);
let status = CodonEntropyScreen.evaluate_with(&dna(seq), &c);
assert!(matches!(status, InvariantStatus::Pass));
}
#[test]
fn d7_invalid_organism_fails_profile_validation() {
use crate::models::error::Validate;
let mut p = profile();
p.codon_usage_organism = Some("unknown_organism".into());
assert!(p.validate().is_err());
}
#[test]
fn d7_chi_squared_uniform_codons_flags_deviation() {
let mut p = profile();
p.codon_usage_organism = Some("e_coli".into());
let seq = "AAA".repeat(50); let c = ctx(&[], &p);
let status = CodonEntropyScreen.evaluate_with(&dna(&seq), &c);
assert!(matches!(status, InvariantStatus::Advisory { .. }));
}
#[test]
fn d7_chi_squared_ecoli_like_sequence_passes() {
let mut p = profile();
p.codon_usage_organism = Some("e_coli".into());
let codons = [
"CTG", "GAA", "GAT", "ATG", "GCG", "AAA", "GGC", "ACC", "GAC", "ATT", "AAC", "GCA",
"TTC", "TAT", "CAG", "GGT", "AGC", "GTG", "GCC", "CTG", "GAA", "GAT", "ATG", "GCG",
"AAA", "GGC", "ACC", "GAC", "ATT", "AAC", "TTC", "TAT", "CAG", "GGT", "AGC", "GTG",
"TCT", "CCG", "CGT", "AAT",
];
let seq: String = codons.join("");
let c = ctx(&[], &p);
let status = CodonEntropyScreen.evaluate_with(&dna(&seq), &c);
assert!(
matches!(status, InvariantStatus::Pass),
"E. coli-like sequence should pass D7: got {status:?}"
);
}
#[test]
fn d7_cutg_table_lookup_returns_some_for_known_organisms() {
assert!(super::cutg_table("e_coli").is_some());
assert!(super::cutg_table("s_cerevisiae").is_some());
assert!(super::cutg_table("h_sapiens").is_some());
assert!(super::cutg_table("cho_k1").is_some());
assert!(super::cutg_table("unknown").is_none());
}
#[test]
fn d7_codon_to_index_round_trip() {
assert_eq!(super::codon_to_index(b"AAA"), 0);
assert_eq!(super::codon_to_index(b"TTT"), 63);
assert_eq!(super::codon_to_index(b"ATG"), 3 * 4 + 2);
}
#[test]
fn d7_chi2_survival_reasonable() {
let p = super::chi2_survival_approx(60.0, 60);
assert!(p > 0.3 && p < 0.7, "p={p}");
let p2 = super::chi2_survival_approx(200.0, 60);
assert!(p2 < 0.001, "p2={p2}");
}
#[test]
fn d7_protein_kmer_k_profile_validation() {
use crate::models::error::Validate;
let mut p = profile();
p.protein_kmer_k = Some(2);
assert!(p.validate().is_err());
p.protein_kmer_k = Some(5);
assert!(p.validate().is_ok());
p.protein_kmer_k = Some(9);
assert!(p.validate().is_err());
}
#[test]
fn d7_protein_kmer_threshold_profile_validation() {
use crate::models::error::Validate;
let mut p = profile();
p.protein_kmer_threshold = Some(0.0);
assert!(p.validate().is_err());
p.protein_kmer_threshold = Some(0.5);
assert!(p.validate().is_ok());
p.protein_kmer_threshold = Some(1.5);
assert!(p.validate().is_err());
}
#[test]
fn d8_normal_gc_passes() {
let s = "ATGCATGCATGCATGCATGC";
assert!(matches!(
GcContentScreen.evaluate(&dna(s)),
InvariantStatus::Pass
));
}
#[test]
fn d8_low_gc_window_fails() {
let s = "A".repeat(100);
assert!(matches!(
GcContentScreen.evaluate(&dna(&s)),
InvariantStatus::Fail { .. }
));
}
#[test]
fn d8_high_gc_window_fails() {
let s = "G".repeat(100);
assert!(matches!(
GcContentScreen.evaluate(&dna(&s)),
InvariantStatus::Fail { .. }
));
}
#[test]
fn d9_short_passes() {
assert!(matches!(
SecondaryStructureScreen.evaluate(&dna("ATGCGT")),
InvariantStatus::Pass
));
}
#[test]
fn d9_palindromic_hairpin_fails() {
let head = "AAAACCCCGGGGTTTTACGT";
let tail: String = head
.chars()
.rev()
.map(|c| match c {
'A' => 'T',
'T' => 'A',
'C' => 'G',
'G' => 'C',
_ => 'N',
})
.collect();
let seq = format!("{head}AAAAA{tail}");
assert!(matches!(
SecondaryStructureScreen.evaluate(&dna(&seq)),
InvariantStatus::Fail { .. }
));
}
#[test]
fn d9_random_sequence_passes() {
let s = "ATGAAAGCTGGCGTTTTTTGCCTGCCAATAGTAGCTGAGCTAGCTAGCTGAGCTGAATCAGT";
assert!(matches!(
SecondaryStructureScreen.evaluate(&dna(s)),
InvariantStatus::Pass
));
}
#[test]
fn d10_clean_passes() {
assert!(matches!(
AssemblyCompatibilityScreen.evaluate(&dna("ATGAAACCCGGGTTT")),
InvariantStatus::Pass
));
}
#[test]
fn d10_bsai_terminus_advisory() {
let s = "GGTCTCAAAACCCC";
assert!(matches!(
AssemblyCompatibilityScreen.evaluate(&dna(s)),
InvariantStatus::Advisory { .. }
));
}
#[test]
fn d10_bbsi_tail_advisory() {
let s = "AAAACCCCGAAGAC";
assert!(matches!(
AssemblyCompatibilityScreen.evaluate(&dna(s)),
InvariantStatus::Advisory { .. }
));
}
#[test]
fn safe_bundle_passes_all_d_invariants_with_no_hits() {
let p = profile();
let bundle = dna("ATGAAAGCTGGCGTTTTTTGCCTG");
let c = ctx(&[], &p);
for s in [
SelectAgentScreen.evaluate_with(&bundle, &c),
PandemicPathogenScreen.evaluate_with(&bundle, &c),
ToxinGeneScreen.evaluate_with(&bundle, &c),
VirulenceFactorScreen.evaluate_with(&bundle, &c),
AntibioticResistanceScreen.evaluate_with(&bundle, &c),
SynbioPartScreen.evaluate_with(&bundle, &c),
] {
assert!(matches!(s, InvariantStatus::Pass), "got {:?}", s);
}
}
#[test]
fn translate_dna_only_runs_on_dna_bundles() {
let res = translate_dna(&dna("ATGCGT")).expect("dna bundle");
assert!(res.is_ok());
}
#[test]
fn translate_known_control_sequence() {
let frames = translate_dna_sequence("ATGAAATAA").unwrap();
assert_eq!(frames.frame1, "MK*");
}
#[test]
fn translate_yields_three_frames() {
let frames = translate_dna_sequence("ATGAAATAA").unwrap();
assert_eq!(frames.frame1.len(), 3);
assert_eq!(frames.frame2.len(), 2);
assert_eq!(frames.frame3.len(), 2);
}
/// Upper- and lower-case spellings of the same sequence must translate to
/// equal results.
#[test]
fn translate_lowercase_is_normalized() {
    // The array is mapped in order, so the upper-case input is translated first.
    let [upper, lower] =
        ["ATGAAATAA", "atgaaataa"].map(|sequence| translate_dna_sequence(sequence).unwrap());
    assert_eq!(upper, lower);
}
/// A codon made entirely of `N` must be rendered as `X` in the protein.
#[test]
fn translate_ambiguous_bases_become_x() {
    let translated = translate_dna_sequence("ATGNNNTAA").unwrap();
    assert_eq!(translated.frame1, "MX*");
}
/// A trailing base that does not complete a codon must not contribute a
/// residue: 7 nt in frame 1 gives exactly `MK`.
#[test]
fn translate_drops_trailing_partial_codon() {
    let translated = translate_dna_sequence("ATGAAAT").unwrap();
    assert_eq!(translated.frame1, "MK");
}
/// Translating the empty string must succeed and leave all three frames empty.
#[test]
fn translate_empty_input() {
    let translated = translate_dna_sequence("").unwrap();
    let emptiness = [
        translated.frame1.is_empty(),
        translated.frame2.is_empty(),
        translated.frame3.is_empty(),
    ];
    assert_eq!(emptiness, [true; 3]);
}
/// A `Z` at zero-based offset 3 must be rejected with a matching
/// `InvalidBase` error carrying that base and offset.
#[test]
fn translate_rejects_non_acgtn() {
    let expected = TranslateError::InvalidBase {
        base: 'Z',
        offset: 3,
    };
    let actual = translate_dna_sequence("ATGZAA").unwrap_err();
    assert_eq!(actual, expected);
}
/// Embedded whitespace must be rejected as an `InvalidBase` error.
#[test]
fn translate_rejects_whitespace() {
    let outcome = translate_dna_sequence("ATG AAA");
    let failure = outcome.unwrap_err();
    assert!(matches!(failure, TranslateError::InvalidBase { .. }));
}
/// Shifting the reading frame of `ATGAAA` must change the translation.
#[test]
fn translate_frame_shift_changes_protein() {
    let translated = translate_dna_sequence("ATGAAA").unwrap();
    // Frame 1 reads ATG AAA.
    assert_eq!(translated.frame1, "MK");
    // Frame 2 reads TGA (stop); the trailing AA is incomplete.
    assert_eq!(translated.frame2, "*");
    // Frame 3 reads GAA; the trailing A is incomplete.
    assert_eq!(translated.frame3, "E");
}
/// `iter()` on the translated frames must yield frame 1, then frame 2, then
/// frame 3.
#[test]
fn translate_frames_iter_in_order() {
    let translated = translate_dna_sequence("ATGAAATAA").unwrap();
    let expected = [
        "MK*",
        translated.frame2.as_str(),
        translated.frame3.as_str(),
    ];
    let yielded: Vec<&str> = translated.iter().collect();
    assert_eq!(yielded, expected);
}
/// Each of the three stop codons must translate to `*`.
#[test]
fn translate_all_stop_codons() {
    for stop_codon in ["TAA", "TAG", "TGA"] {
        let translated = translate_dna_sequence(stop_codon).unwrap();
        assert_eq!(translated.frame1, "*");
    }
}
/// A 5-residue protein with k = 3 has three overlapping windows, and three
/// k-mers are expected back.
#[test]
fn protein_kmers_extracts_correct_count() {
    let extracted = super::super::homology::KmerHomologyEngine::protein_kmers("MKFAL", 3);
    let count = extracted.len();
    assert_eq!(count, 3);
}
/// Every 3-residue window of `MKXAL` contains the `X`, and no k-mers are
/// expected to be produced.
#[test]
fn protein_kmers_skips_unknown_residues() {
    let extracted = super::super::homology::KmerHomologyEngine::protein_kmers("MKXAL", 3);
    let count = extracted.len();
    assert_eq!(count, 0);
}
/// A protein shorter than k (2 residues, k = 3) must yield no k-mers.
#[test]
fn protein_kmers_empty_for_short_seq() {
    let extracted = super::super::homology::KmerHomologyEngine::protein_kmers("MK", 3);
    assert!(extracted.is_empty());
}
/// The Jaccard similarity of a k-mer collection with itself must be 1
/// (within 1e-10).
#[test]
fn kmer_jaccard_identical_is_one() {
    let kmers = super::super::homology::KmerHomologyEngine::protein_kmers("MKFAL", 3);
    let similarity = super::super::homology::KmerHomologyEngine::jaccard(&kmers, &kmers);
    let deviation = (similarity - 1.0).abs();
    assert!(deviation < 1e-10);
}
/// The Jaccard similarity of the k-mers of `MKFAL` and `DDDDD` must be 0
/// (within 1e-10).
#[test]
fn kmer_jaccard_disjoint_is_zero() {
    let left = super::super::homology::KmerHomologyEngine::protein_kmers("MKFAL", 3);
    let right = super::super::homology::KmerHomologyEngine::protein_kmers("DDDDD", 3);
    let similarity = super::super::homology::KmerHomologyEngine::jaccard(&left, &right);
    // Distance from zero is just the absolute value.
    assert!(similarity.abs() < 1e-10);
}
/// A pre-existing `select-agent` screening hit must show up in the output of
/// `combined_screen`, identifiable by its id.
#[test]
fn combined_screen_includes_regex_hits() {
    let screening_profile = profile();
    let screening_hits = vec![hit("select-agent", "sap-1")];
    let bundle = dna("ATGAAA");
    let context = ctx(&screening_hits, &screening_profile);

    let reported = super::combined_screen(&bundle, &context, SELECT_AGENT_CLASSES);

    assert!(!reported.is_empty());
    assert!(reported.iter().any(|entry| entry.contains("sap-1")));
}
/// With no screening hits in the context, `combined_screen` must report
/// nothing for this bundle.
#[test]
fn combined_screen_no_hits_for_clean_bundle() {
    let screening_profile = profile();
    let bundle = dna("ATGAAA");
    let context = ctx(&[], &screening_profile);

    let reported = super::combined_screen(&bundle, &context, SELECT_AGENT_CLASSES);

    assert!(reported.is_empty());
}
/// Exercises `protein_space_rescreen` with query/hit pairs that differ at the
/// DNA level but use synonymous codons.
///
/// * Short pair (9 nt each, both spelling M-K-A): no protein-level hit is
///   expected. Presumably three residues are too short to form a k-mer at
///   the profile's k; confirm against `profile()`.
/// * Long pair (30 nt each, both spelling MKAFLIDNEA): a protein-level hit is
///   expected.
#[test]
fn protein_rescreen_detects_codon_substituted_homolog() {
    // Builds a select-agent hit with a catch-all pattern.
    let select_agent_hit = |id: &'static str, label: &'static str, matched: &'static str| {
        HazardHit {
            entry: HazardEntry {
                id: id.into(),
                label: label.into(),
                hazard_class: "select-agent".into(),
                pattern: ".*".into(),
            },
            matched_text: matched.into(),
        }
    };
    let screening_profile = profile();

    // Short case: expected to produce nothing.
    let short_hits = vec![select_agent_hit(
        "sap-codon-sub",
        "codon-sub test",
        "ATGAAAGCG",
    )];
    let short_bundle = dna("ATGAAGGCC");
    let short_ctx = ctx(&short_hits, &screening_profile);
    let short_result =
        super::protein_space_rescreen(&short_bundle, &short_ctx, SELECT_AGENT_CLASSES);
    assert!(short_result.is_empty());

    // Long case: synonymous-codon variant must be detected.
    let long_dna = "ATGAAAGCGTTCCTGATTGATAACGAAGCC";
    let long_hits = vec![select_agent_hit(
        "sap-long",
        "long test",
        "ATGAAGGCCTTTTTGATTGACAATGAGGCC",
    )];
    let long_bundle = dna(long_dna);
    let long_ctx = ctx(&long_hits, &screening_profile);
    let long_result =
        super::protein_space_rescreen(&long_bundle, &long_ctx, SELECT_AGENT_CLASSES);
    assert!(
        !long_result.is_empty(),
        "protein-space rescreen should detect synonymous-codon homolog"
    );
}
/// The select-agent screen must return `Fail` when the context carries a
/// select-agent hit whose matched text is a synonymous-codon variant of the
/// bundle's sequence.
#[test]
fn d1_detects_protein_level_hit() {
    let screening_profile = profile();
    let query_dna = "ATGAAAGCGTTCCTGATTGATAACGAAGCC";

    let entry = HazardEntry {
        id: "sap-protein".into(),
        label: "protein hit".into(),
        hazard_class: "select-agent".into(),
        pattern: ".*".into(),
    };
    let screening_hits = vec![HazardHit {
        entry,
        matched_text: "ATGAAGGCCTTTTTGATTGACAATGAGGCC".into(),
    }];

    let bundle = dna(query_dna);
    let context = ctx(&screening_hits, &screening_profile);
    let status = SelectAgentScreen.evaluate_with(&bundle, &context);
    assert!(
        matches!(status, InvariantStatus::Fail { .. }),
        "D1 should fail on protein-level hit: got {status:?}"
    );
}
/// With the profile's codon-usage organism set to `e_coli`, a sequence made
/// of eight codons repeated five times each (40 codons in total) must be
/// flagged as an advisory by the codon-entropy screen.
#[test]
fn d7_chi_squared_yeast_codons_deviate_from_ecoli() {
    // Five copies of each codon, in this order, back to back.
    let seq: String = ["AGA", "TTA", "ACA", "ATA", "GAA", "ATG", "CTG", "GCG"]
        .map(|codon| codon.repeat(5))
        .concat();

    let mut screening_profile = profile();
    screening_profile.codon_usage_organism = Some("e_coli".into());
    let context = ctx(&[], &screening_profile);

    let bundle = dna(&seq);
    let status = CodonEntropyScreen.evaluate_with(&bundle, &context);
    assert!(
        matches!(status, InvariantStatus::Advisory { .. }),
        "yeast-biased codons should deviate from E. coli CUTG: got {status:?}"
    );
}
/// With the organism set to `h_sapiens`, the codon-entropy screen must return
/// an advisory whose note mentions at least one of "entropy", "χ²" or "CUTG".
#[test]
fn d7_chi_squared_advisory_contains_chi_squared_info() {
    // (codon, repeat count) pairs, concatenated in order: 40 codons in total.
    let seq: String = [("AGA", 15), ("CTG", 15), ("GAA", 5), ("ATG", 5)]
        .iter()
        .map(|(codon, copies)| codon.repeat(*copies))
        .collect();

    let mut screening_profile = profile();
    screening_profile.codon_usage_organism = Some("h_sapiens".into());
    let context = ctx(&[], &screening_profile);

    let bundle = dna(&seq);
    let status = CodonEntropyScreen.evaluate_with(&bundle, &context);
    let InvariantStatus::Advisory { note } = &status else {
        panic!("expected Advisory, got {status:?}");
    };
    let mentions_metric = ["entropy", "χ²", "CUTG"]
        .iter()
        .any(|keyword| note.contains(*keyword));
    assert!(
        mentions_metric,
        "advisory should mention entropy or chi-squared: {note}"
    );
}
/// When the bundle holds the reverse complement of a hit's matched text,
/// `protein_space_rescreen` must still report a hit.
#[test]
fn protein_rescreen_catches_reverse_complement_homolog() {
    let screening_profile = profile();
    let forward = "ATGAAAGCGTTCCTGATTGATAACGAAGCC";

    // The query is the reverse complement of the upper-cased forward strand.
    let forward_upper = forward.to_ascii_uppercase();
    let reverse_complement =
        String::from_utf8(super::revcomp(forward_upper.as_bytes())).unwrap();

    let entry = HazardEntry {
        id: "sap-rc".into(),
        label: "reverse complement".into(),
        hazard_class: "select-agent".into(),
        pattern: ".*".into(),
    };
    let screening_hits = vec![HazardHit {
        entry,
        matched_text: forward.into(),
    }];

    let bundle = dna(&reverse_complement);
    let context = ctx(&screening_hits, &screening_profile);
    let reported = super::protein_space_rescreen(&bundle, &context, SELECT_AGENT_CLASSES);
    assert!(
        !reported.is_empty(),
        "should detect protein homology via reverse complement"
    );
}
/// With `protein_kmer_k` set to 3 on the profile, a bundle identical to the
/// hit's matched text (15 nt, i.e. 5 residues in frame 1) must produce a
/// protein-space hit.
///
/// NOTE(review): five residues are also long enough to form one 5-mer, so
/// this input may hit under the default k as well; confirm the test really
/// distinguishes the override from the default.
#[test]
fn profile_kmer_k_overrides_default() {
    let mut screening_profile = profile();
    screening_profile.protein_kmer_k = Some(3);

    let entry = HazardEntry {
        id: "sap-k3".into(),
        label: "k3 test".into(),
        hazard_class: "select-agent".into(),
        pattern: ".*".into(),
    };
    let screening_hits = vec![HazardHit {
        entry,
        matched_text: "ATGAAAGCGTTCCTG".into(),
    }];

    let bundle = dna("ATGAAAGCGTTCCTG");
    let context = ctx(&screening_hits, &screening_profile);
    let reported = super::protein_space_rescreen(&bundle, &context, SELECT_AGENT_CLASSES);
    assert!(
        !reported.is_empty(),
        "smaller k should produce hits for identical 5-residue proteins"
    );
}
}