Skip to main content

ref_solver/core/
reference.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashSet;
3
4use crate::core::contig::{detect_naming_convention, Contig, SequenceRole};
5use crate::core::types::{Assembly, NamingConvention, ReferenceId, ReferenceSource};
6use crate::utils::validation::compute_signature as compute_sig;
7
8/// A known reference genome in the catalog
9#[derive(Debug, Clone, Serialize, Deserialize)]
10#[non_exhaustive]
11pub struct KnownReference {
12    /// Unique identifier
13    pub id: ReferenceId,
14
15    /// Human-readable display name
16    pub display_name: String,
17
18    /// Assembly version
19    pub assembly: Assembly,
20
21    /// Source organization
22    pub source: ReferenceSource,
23
24    /// Naming convention used
25    pub naming_convention: NamingConvention,
26
27    /// Download URL for the reference FASTA
28    #[serde(default, skip_serializing_if = "Option::is_none")]
29    pub download_url: Option<String>,
30
31    /// Path to NCBI assembly report (if applicable)
32    #[serde(default, skip_serializing_if = "Option::is_none")]
33    pub assembly_report_url: Option<String>,
34
35    /// All contigs in this reference
36    pub contigs: Vec<Contig>,
37
38    /// Description/notes about this reference
39    #[serde(default, skip_serializing_if = "Option::is_none")]
40    pub description: Option<String>,
41
42    /// Tags for filtering (e.g., "`with_decoy`", "`no_alt`", "`analysis_set`")
43    #[serde(default, skip_serializing_if = "Vec::is_empty")]
44    pub tags: Vec<String>,
45
46    /// Contigs that appear in the assembly report but not in the FASTA/dict
47    /// (e.g., MT in CHM13 which uses standard rCRS mitochondria)
48    #[serde(default, skip_serializing_if = "Vec::is_empty")]
49    pub contigs_missing_from_fasta: Vec<String>,
50
51    // === Pre-computed for fast matching (populated on load) ===
52    /// Set of all MD5 checksums in this reference
53    #[serde(skip)]
54    pub md5_set: HashSet<String>,
55
56    /// Set of all sha512t24u digests in this reference
57    #[serde(skip)]
58    pub sha512t24u_set: HashSet<String>,
59
60    /// Set of all (`exact_name`, length) pairs for matching
61    #[serde(skip)]
62    pub name_length_set: HashSet<(String, u64)>,
63
64    /// Signature for exact matching (hash of sorted MD5s)
65    #[serde(skip)]
66    pub signature: Option<String>,
67}
68
69impl KnownReference {
70    pub fn new(
71        id: impl Into<String>,
72        display_name: impl Into<String>,
73        assembly: Assembly,
74        source: ReferenceSource,
75    ) -> Self {
76        Self {
77            id: ReferenceId::new(id),
78            display_name: display_name.into(),
79            assembly,
80            source,
81            naming_convention: NamingConvention::Mixed,
82            download_url: None,
83            assembly_report_url: None,
84            contigs: Vec::new(),
85            description: None,
86            tags: Vec::new(),
87            contigs_missing_from_fasta: Vec::new(),
88            md5_set: HashSet::new(),
89            sha512t24u_set: HashSet::new(),
90            name_length_set: HashSet::new(),
91            signature: None,
92        }
93    }
94
95    #[must_use]
96    pub fn with_contigs(mut self, contigs: Vec<Contig>) -> Self {
97        self.naming_convention = detect_naming_convention(&contigs);
98        self.contigs = contigs;
99        self.rebuild_indexes();
100        self
101    }
102
103    /// Rebuild the internal indexes after modifying contigs
104    pub fn rebuild_indexes(&mut self) {
105        self.md5_set.clear();
106        self.sha512t24u_set.clear();
107        self.name_length_set.clear();
108
109        for contig in &self.contigs {
110            if let Some(md5) = &contig.md5 {
111                self.md5_set.insert(md5.clone());
112            }
113            if let Some(digest) = &contig.sha512t24u {
114                self.sha512t24u_set.insert(digest.clone());
115            }
116            // Use exact name for matching (no normalization)
117            self.name_length_set
118                .insert((contig.name.clone(), contig.length));
119
120            // Also add aliases to name_length_set for matching
121            for alias in &contig.aliases {
122                self.name_length_set.insert((alias.clone(), contig.length));
123            }
124        }
125
126        // Compute signature from sorted MD5s
127        self.signature = self.compute_signature();
128    }
129
130    /// Compute a signature for exact matching
131    /// Uses sorted MD5s concatenated and hashed
132    fn compute_signature(&self) -> Option<String> {
133        let sig = compute_sig(&self.md5_set);
134        if sig.is_empty() {
135            None
136        } else {
137            Some(sig)
138        }
139    }
140
141    /// Check if this reference has decoy sequences
142    #[must_use]
143    pub fn has_decoy(&self) -> bool {
144        self.contigs.iter().any(super::contig::Contig::is_decoy)
145    }
146
147    /// Check if this reference has ALT contigs
148    #[must_use]
149    pub fn has_alt(&self) -> bool {
150        self.contigs.iter().any(super::contig::Contig::is_alt)
151    }
152
153    /// Count contigs by sequence role
154    #[must_use]
155    pub fn role_counts(&self) -> RoleCounts {
156        let mut counts = RoleCounts::default();
157        for contig in &self.contigs {
158            match contig.sequence_role {
159                SequenceRole::AssembledMolecule => counts.assembled_molecule += 1,
160                SequenceRole::AltScaffold => counts.alt_scaffold += 1,
161                SequenceRole::FixPatch => counts.fix_patch += 1,
162                SequenceRole::NovelPatch => counts.novel_patch += 1,
163                SequenceRole::UnlocalizedScaffold => counts.unlocalized_scaffold += 1,
164                SequenceRole::UnplacedScaffold => counts.unplaced_scaffold += 1,
165                SequenceRole::Unknown => counts.unknown += 1,
166            }
167        }
168        counts
169    }
170}
171
172/// Counts of contigs by sequence role
173#[derive(Debug, Clone, Default, Serialize, Deserialize)]
174pub struct RoleCounts {
175    pub assembled_molecule: usize,
176    pub alt_scaffold: usize,
177    pub fix_patch: usize,
178    pub novel_patch: usize,
179    pub unlocalized_scaffold: usize,
180    pub unplaced_scaffold: usize,
181    pub unknown: usize,
182}