use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use crate::core::contig::{detect_naming_convention, Contig, SequenceRole};
use crate::core::types::{Assembly, NamingConvention, ReferenceId, ReferenceSource};
use crate::utils::validation::compute_signature as compute_sig;
/// A catalogued reference genome build: identity, provenance, its contigs, and
/// lookup indexes derived from those contigs.
///
/// The struct is `#[non_exhaustive]`, so code outside this crate cannot build it
/// with a struct literal; use [`KnownReference::new`] and
/// [`KnownReference::with_contigs`] instead.
///
/// The four `#[serde(skip)]` fields at the bottom are never written out and are
/// left at their `Default` (empty / `None`) values after deserialization. Call
/// [`KnownReference::rebuild_indexes`] to repopulate them from `contigs`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct KnownReference {
/// Identifier of this reference (constructed via `ReferenceId::new`).
pub id: ReferenceId,
/// Name used when presenting this reference (presumably human-readable).
pub display_name: String,
/// Assembly this reference belongs to.
pub assembly: Assembly,
/// Source this reference was obtained from.
pub source: ReferenceSource,
/// Contig naming convention. `new` sets this to `NamingConvention::Mixed`;
/// `with_contigs` replaces it with the result of `detect_naming_convention`.
pub naming_convention: NamingConvention,
/// Optional download URL; omitted from serialized output when `None` and
/// defaulted to `None` when absent from the input.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub download_url: Option<String>,
/// Optional assembly-report URL; same serialization rules as `download_url`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub assembly_report_url: Option<String>,
/// Contigs that make up this reference. The derived indexes below are built
/// from this list.
pub contigs: Vec<Contig>,
/// Optional free-text description; omitted from serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Free-form tags; omitted from serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub tags: Vec<String>,
/// Names of contigs missing from the FASTA (presumably listed elsewhere, e.g.
/// in the assembly report — confirm against the catalog builder). Omitted
/// from serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub contigs_missing_from_fasta: Vec<String>,
/// Derived: every `md5` value present on a contig. Not (de)serialized.
#[serde(skip)]
pub md5_set: HashSet<String>,
/// Derived: every `sha512t24u` digest present on a contig. Not (de)serialized.
#[serde(skip)]
pub sha512t24u_set: HashSet<String>,
/// Derived: `(name, length)` pairs for each contig name and each of its
/// aliases. Not (de)serialized.
#[serde(skip)]
pub name_length_set: HashSet<(String, u64)>,
/// Derived: signature computed from `md5_set`, or `None` when the computed
/// signature is empty. Not (de)serialized.
#[serde(skip)]
pub signature: Option<String>,
}
impl KnownReference {
pub fn new(
id: impl Into<String>,
display_name: impl Into<String>,
assembly: Assembly,
source: ReferenceSource,
) -> Self {
Self {
id: ReferenceId::new(id),
display_name: display_name.into(),
assembly,
source,
naming_convention: NamingConvention::Mixed,
download_url: None,
assembly_report_url: None,
contigs: Vec::new(),
description: None,
tags: Vec::new(),
contigs_missing_from_fasta: Vec::new(),
md5_set: HashSet::new(),
sha512t24u_set: HashSet::new(),
name_length_set: HashSet::new(),
signature: None,
}
}
#[must_use]
pub fn with_contigs(mut self, contigs: Vec<Contig>) -> Self {
self.naming_convention = detect_naming_convention(&contigs);
self.contigs = contigs;
self.rebuild_indexes();
self
}
pub fn rebuild_indexes(&mut self) {
self.md5_set.clear();
self.sha512t24u_set.clear();
self.name_length_set.clear();
for contig in &self.contigs {
if let Some(md5) = &contig.md5 {
self.md5_set.insert(md5.clone());
}
if let Some(digest) = &contig.sha512t24u {
self.sha512t24u_set.insert(digest.clone());
}
self.name_length_set
.insert((contig.name.clone(), contig.length));
for alias in &contig.aliases {
self.name_length_set.insert((alias.clone(), contig.length));
}
}
self.signature = self.compute_signature();
}
fn compute_signature(&self) -> Option<String> {
let sig = compute_sig(&self.md5_set);
if sig.is_empty() {
None
} else {
Some(sig)
}
}
#[must_use]
pub fn has_decoy(&self) -> bool {
self.contigs.iter().any(super::contig::Contig::is_decoy)
}
#[must_use]
pub fn has_alt(&self) -> bool {
self.contigs.iter().any(super::contig::Contig::is_alt)
}
#[must_use]
pub fn role_counts(&self) -> RoleCounts {
let mut counts = RoleCounts::default();
for contig in &self.contigs {
match contig.sequence_role {
SequenceRole::AssembledMolecule => counts.assembled_molecule += 1,
SequenceRole::AltScaffold => counts.alt_scaffold += 1,
SequenceRole::FixPatch => counts.fix_patch += 1,
SequenceRole::NovelPatch => counts.novel_patch += 1,
SequenceRole::UnlocalizedScaffold => counts.unlocalized_scaffold += 1,
SequenceRole::UnplacedScaffold => counts.unplaced_scaffold += 1,
SequenceRole::Unknown => counts.unknown += 1,
}
}
counts
}
}
/// Per-role contig tally, as produced by `KnownReference::role_counts`.
///
/// Each field counts the contigs whose `sequence_role` is the matching
/// `SequenceRole` variant. `Default` yields all zeros.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RoleCounts {
/// Contigs with role `SequenceRole::AssembledMolecule`.
pub assembled_molecule: usize,
/// Contigs with role `SequenceRole::AltScaffold`.
pub alt_scaffold: usize,
/// Contigs with role `SequenceRole::FixPatch`.
pub fix_patch: usize,
/// Contigs with role `SequenceRole::NovelPatch`.
pub novel_patch: usize,
/// Contigs with role `SequenceRole::UnlocalizedScaffold`.
pub unlocalized_scaffold: usize,
/// Contigs with role `SequenceRole::UnplacedScaffold`.
pub unplaced_scaffold: usize,
/// Contigs with role `SequenceRole::Unknown`.
pub unknown: usize,
}