use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use crate::core::contig::SequenceRole;
use crate::core::types::ReferenceSource;
/// A named assembly together with every version recorded for it.
///
/// No serde field attributes are applied here, so all four fields are always
/// written on serialization and are required on deserialization.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct HierarchicalAssembly {
/// Identifier of the assembly.
pub id: String,
/// Name of the assembly.
pub name: String,
/// Organism the assembly is recorded for.
pub organism: String,
/// Versions of this assembly.
pub versions: Vec<AssemblyVersion>,
}
/// One version of an assembly: the provenance of its report, the report-level
/// contigs, and the FASTA distributions recorded for it.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct AssemblyVersion {
/// Identifier of this version.
pub id: String,
/// Version label.
pub version: String,
/// Where the report data for this version came from.
pub source: ReportSource,
/// Report-level contigs. Defaults to an empty list when the field is absent
/// on input and is left out of the serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub report_contigs: Vec<ReportContig>,
/// FASTA distributions of this version. Defaults to an empty list when the
/// field is absent on input and is left out of the serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub fasta_distributions: Vec<FastaDistribution>,
}
/// Provenance of an assembly version's report data.
///
/// Serialized as an internally tagged enum: the variant is stored in a `type`
/// field next to the variant's own fields, using snake_case variant names
/// (`ncbi`, `derived_from_fasta`, `manual`).
///
/// Every `Option` field below defaults to `None` when absent on input and is
/// omitted from the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ReportSource {
/// Report identified by an accession (presumably an NCBI assembly
/// accession; the value is not validated here).
Ncbi {
/// Accession string; required.
accession: String,
/// Optional URL associated with the report.
#[serde(default, skip_serializing_if = "Option::is_none")]
url: Option<String>,
/// Optional date, kept as free-form text (no format is enforced here).
#[serde(default, skip_serializing_if = "Option::is_none")]
date: Option<String>,
},
/// Report data derived from one or more FASTA files.
DerivedFromFasta {
/// Files the data was derived from; required.
source_files: Vec<String>,
/// Optional reference to a base assembly.
#[serde(default, skip_serializing_if = "Option::is_none")]
base_assembly: Option<String>,
},
/// Manually entered report data.
Manual {
/// Optional name of the curator.
#[serde(default, skip_serializing_if = "Option::is_none")]
curator: Option<String>,
/// Optional free-form notes.
#[serde(default, skip_serializing_if = "Option::is_none")]
notes: Option<String>,
},
}
/// A contig as listed at the report level of an assembly version.
///
/// Every `Option` field defaults to `None` when absent on input and is omitted
/// from the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ReportContig {
/// Numeric identifier of this contig (presumably the value that
/// `FastaContig::report_contig_id` refers to; confirm against callers).
pub id: u32,
/// Sequence name of the contig.
pub sequence_name: String,
/// Sequence length.
pub length: u64,
/// Optional MD5 checksum string.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub md5: Option<String>,
/// Optional RefSeq accession.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub refseq_accn: Option<String>,
/// Optional GenBank accession.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub genbank_accn: Option<String>,
/// Optional UCSC-style name.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub ucsc_name: Option<String>,
/// Role of the sequence. Falls back to `SequenceRole::default()` when absent
/// on input; always written on serialization.
#[serde(default)]
pub sequence_role: SequenceRole,
/// Optional assigned molecule.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub assigned_molecule: Option<String>,
}
/// A FASTA distribution of an assembly version, together with the contigs it contains.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FastaDistribution {
/// Identifier of this distribution.
pub id: String,
/// Human-readable name.
pub display_name: String,
/// Source of the distribution.
pub source: ReferenceSource,
/// Optional download URL. Defaults to `None` when absent on input and is
/// omitted from the serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub download_url: Option<String>,
/// Free-form tags. Defaults to an empty list when absent on input and is
/// omitted from the serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub tags: Vec<String>,
/// Contigs in this distribution; required on input and always serialized.
pub contigs: Vec<FastaContig>,
}
impl FastaDistribution {
#[must_use]
pub fn presence_counts(&self) -> PresenceCounts {
let mut counts = PresenceCounts::default();
for contig in &self.contigs {
if contig.report_contig_id.is_some() {
counts.in_both += 1;
} else {
counts.fasta_only += 1;
}
}
counts
}
}
/// A contig as it appears in a FASTA distribution.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FastaContig {
/// Contig name; `merge` requires both sides to have the same name.
pub name: String,
/// Sequence length; `merge` requires both sides to have the same length.
pub length: u64,
/// MD5 checksum string. `merge` treats an empty string as "not known".
pub md5: String,
/// Ordering key for this contig (presumably its position within the
/// distribution; confirm against the code that assigns it).
pub sort_order: u32,
/// Identifier of a linked report contig, if any. `presence_counts` counts a
/// contig as present in both report and FASTA when this is set. Defaults to
/// `None` when absent on input and is omitted from the output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub report_contig_id: Option<u32>,
/// Alternative names. Defaults to an empty list when absent on input and is
/// omitted from the serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub aliases: Vec<String>,
}
impl FastaContig {
    /// Builds a contig with sort order 0, no report link and no aliases.
    ///
    /// Only compiled for test builds.
    #[cfg(test)]
    pub fn new(name: impl Into<String>, length: u64, md5: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            length,
            md5: md5.into(),
            sort_order: 0,
            report_contig_id: None,
            aliases: Vec::new(),
        }
    }

    /// Folds the metadata of `other` into `self`; both must describe the same contig.
    ///
    /// * MD5: adopted from `other` when `self` has none (empty string); an empty
    ///   MD5 on `other` is ignored.
    /// * Aliases: appended from `other` in their original order, skipping any
    ///   that equal `self.name`, are already present in `self.aliases`, or
    ///   occur more than once in `other.aliases`.
    /// * `report_contig_id`: filled from `other` only when `self` has none.
    ///
    /// `sort_order` is never touched. Every check runs before any field is
    /// modified, so `self` is left unchanged when an error is returned.
    ///
    /// # Errors
    ///
    /// * [`ContigMergeError::NameMismatch`] if the names differ.
    /// * [`ContigMergeError::LengthMismatch`] if the lengths differ.
    /// * [`ContigMergeError::Md5Conflict`] if both MD5s are non-empty and differ.
    pub fn merge(&mut self, other: &FastaContig) -> Result<(), ContigMergeError> {
        if self.name != other.name {
            return Err(ContigMergeError::NameMismatch {
                expected: self.name.clone(),
                found: other.name.clone(),
            });
        }
        if self.length != other.length {
            return Err(ContigMergeError::LengthMismatch {
                name: self.name.clone(),
                expected: self.length,
                found: other.length,
            });
        }
        if !other.md5.is_empty() {
            if self.md5.is_empty() {
                self.md5.clone_from(&other.md5);
            } else if self.md5 != other.md5 {
                return Err(ContigMergeError::Md5Conflict {
                    name: self.name.clone(),
                    existing: self.md5.clone(),
                    incoming: other.md5.clone(),
                });
            }
        }

        // Collect the new aliases in a scope of their own so the borrowed set
        // is gone before `self.aliases` is mutated.
        let additions: Vec<String> = {
            let mut seen: HashSet<&str> = self.aliases.iter().map(String::as_str).collect();
            // The contig's own name must never be recorded as an alias.
            seen.insert(self.name.as_str());

            let mut fresh = Vec::new();
            for alias in &other.aliases {
                // `insert` returns false for anything already seen, which also
                // filters repeats inside `other.aliases` itself.
                if seen.insert(alias.as_str()) {
                    fresh.push(alias.clone());
                }
            }
            fresh
        };
        self.aliases.extend(additions);

        if self.report_contig_id.is_none() {
            self.report_contig_id = other.report_contig_id;
        }
        Ok(())
    }
}
/// Counts of contigs grouped by where they are present.
///
/// `Default` yields all zeros.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct PresenceCounts {
/// Contigs present in both the report and the FASTA.
pub in_both: usize,
/// Contigs present only in the report. `FastaDistribution::presence_counts`
/// never increments this field.
pub report_only: usize,
/// Contigs present only in the FASTA.
pub fasta_only: usize,
}
/// Reasons `FastaContig::merge` can refuse to combine two contigs.
#[derive(Debug, Clone, PartialEq)]
pub enum ContigMergeError {
/// The two contigs have different names.
NameMismatch {
/// Name of the contig being merged into.
expected: String,
/// Name of the incoming contig.
found: String,
},
/// The names match but the lengths differ.
LengthMismatch {
/// Name shared by both contigs.
name: String,
/// Length of the contig being merged into.
expected: u64,
/// Length of the incoming contig.
found: u64,
},
/// Both contigs carry a non-empty MD5 and the values differ.
Md5Conflict {
/// Name shared by both contigs.
name: String,
/// MD5 already stored on the contig being merged into.
existing: String,
/// MD5 carried by the incoming contig.
incoming: String,
},
}
impl std::fmt::Display for ContigMergeError {
    /// Writes a one-line, human-readable description of the merge failure,
    /// including both of the values that disagreed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NameMismatch { expected, found } => write!(
                f,
                "Contig name mismatch: expected '{expected}', found '{found}'"
            ),
            Self::LengthMismatch { name, expected, found } => write!(
                f,
                "Length mismatch for '{name}': expected {expected}, found {found}"
            ),
            Self::Md5Conflict { name, existing, incoming } => write!(
                f,
                "MD5 conflict for '{name}': existing={existing}, incoming={incoming}"
            ),
        }
    }
}
// Empty impl: every `std::error::Error` method keeps its default, so this
// error reports no underlying source.
impl std::error::Error for ContigMergeError {}
#[cfg(test)]
mod tests {
    use super::*;

    /// Length used by every fixture unless a test needs a different one.
    const LEN: u64 = 1000;

    /// Builds an unlinked contig (sort order 0, no report id) with the given aliases.
    fn contig(name: &str, length: u64, md5: &str, aliases: &[&str]) -> FastaContig {
        FastaContig {
            name: name.to_owned(),
            length,
            md5: md5.to_owned(),
            sort_order: 0,
            report_contig_id: None,
            aliases: aliases.iter().map(|alias| (*alias).to_owned()).collect(),
        }
    }

    /// Shorthand for a `chr1` fixture of the default length.
    fn chr1(md5: &str, aliases: &[&str]) -> FastaContig {
        contig("chr1", LEN, md5, aliases)
    }

    #[test]
    fn test_merge_adds_md5_when_missing() {
        let mut target = chr1("", &[]);
        target.merge(&chr1("abc123", &[])).unwrap();
        assert_eq!(target.md5, "abc123");
    }

    #[test]
    fn test_merge_keeps_existing_md5() {
        let mut target = chr1("abc123", &[]);
        target.merge(&chr1("", &[])).unwrap();
        assert_eq!(target.md5, "abc123");
    }

    #[test]
    fn test_merge_md5_conflict_errors() {
        let mut target = chr1("abc123", &[]);
        let failure = target.merge(&chr1("def456", &[])).unwrap_err();
        assert!(matches!(failure, ContigMergeError::Md5Conflict { .. }));
    }

    #[test]
    fn test_merge_md5_same_value_ok() {
        let mut target = chr1("abc123", &[]);
        target.merge(&chr1("abc123", &[])).unwrap();
        assert_eq!(target.md5, "abc123");
    }

    #[test]
    fn test_merge_unions_aliases() {
        let mut target = chr1("", &["1", "NC_000001"]);
        target
            .merge(&chr1("", &["NC_000001", "CM000663"]))
            .unwrap();

        assert_eq!(target.aliases.len(), 3);
        for wanted in ["1", "NC_000001", "CM000663"] {
            assert!(target.aliases.iter().any(|alias| alias == wanted));
        }
    }

    #[test]
    fn test_merge_excludes_name_from_aliases() {
        let mut target = chr1("", &[]);
        target.merge(&chr1("", &["chr1", "1"])).unwrap();

        assert_eq!(target.aliases, vec!["1"]);
        assert!(target.aliases.iter().all(|alias| alias != "chr1"));
    }

    #[test]
    fn test_merge_length_mismatch_errors() {
        let mut target = chr1("", &[]);
        let failure = target.merge(&contig("chr1", 2000, "", &[])).unwrap_err();
        assert!(matches!(failure, ContigMergeError::LengthMismatch { .. }));
    }

    #[test]
    fn test_merge_name_mismatch_errors() {
        let mut target = chr1("", &[]);
        let failure = target.merge(&contig("chr2", LEN, "", &[])).unwrap_err();
        assert!(matches!(failure, ContigMergeError::NameMismatch { .. }));
    }

    #[test]
    fn test_merge_takes_first_report_contig_id() {
        let mut target = FastaContig {
            report_contig_id: Some(42),
            ..chr1("", &[])
        };
        let incoming = FastaContig {
            report_contig_id: Some(99),
            ..chr1("", &[])
        };
        target.merge(&incoming).unwrap();
        assert_eq!(target.report_contig_id, Some(42));
    }

    #[test]
    fn test_merge_fills_missing_report_contig_id() {
        let mut target = chr1("", &[]);
        let incoming = FastaContig {
            report_contig_id: Some(42),
            ..chr1("", &[])
        };
        target.merge(&incoming).unwrap();
        assert_eq!(target.report_contig_id, Some(42));
    }
}