Skip to main content

ref_solver/core/
reference.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashSet;
3
4use crate::core::contig::{detect_naming_convention, Contig, SequenceRole};
5use crate::core::types::{Assembly, NamingConvention, ReferenceId, ReferenceSource};
6use crate::utils::validation::compute_signature as compute_sig;
7
8/// A known reference genome in the catalog
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct KnownReference {
11    /// Unique identifier
12    pub id: ReferenceId,
13
14    /// Human-readable display name
15    pub display_name: String,
16
17    /// Assembly version
18    pub assembly: Assembly,
19
20    /// Source organization
21    pub source: ReferenceSource,
22
23    /// Naming convention used
24    pub naming_convention: NamingConvention,
25
26    /// Download URL for the reference FASTA
27    #[serde(default, skip_serializing_if = "Option::is_none")]
28    pub download_url: Option<String>,
29
30    /// Path to NCBI assembly report (if applicable)
31    #[serde(default, skip_serializing_if = "Option::is_none")]
32    pub assembly_report_url: Option<String>,
33
34    /// All contigs in this reference
35    pub contigs: Vec<Contig>,
36
37    /// Description/notes about this reference
38    #[serde(default, skip_serializing_if = "Option::is_none")]
39    pub description: Option<String>,
40
41    /// Tags for filtering (e.g., "`with_decoy`", "`no_alt`", "`analysis_set`")
42    #[serde(default, skip_serializing_if = "Vec::is_empty")]
43    pub tags: Vec<String>,
44
45    /// Contigs that appear in the assembly report but not in the FASTA/dict
46    /// (e.g., MT in CHM13 which uses standard rCRS mitochondria)
47    #[serde(default, skip_serializing_if = "Vec::is_empty")]
48    pub contigs_missing_from_fasta: Vec<String>,
49
50    // === Pre-computed for fast matching (populated on load) ===
51    /// Set of all MD5 checksums in this reference
52    #[serde(skip)]
53    pub md5_set: HashSet<String>,
54
55    /// Set of all (`exact_name`, length) pairs for matching
56    #[serde(skip)]
57    pub name_length_set: HashSet<(String, u64)>,
58
59    /// Signature for exact matching (hash of sorted MD5s)
60    #[serde(skip)]
61    pub signature: Option<String>,
62}
63
64impl KnownReference {
65    pub fn new(
66        id: impl Into<String>,
67        display_name: impl Into<String>,
68        assembly: Assembly,
69        source: ReferenceSource,
70    ) -> Self {
71        Self {
72            id: ReferenceId::new(id),
73            display_name: display_name.into(),
74            assembly,
75            source,
76            naming_convention: NamingConvention::Mixed,
77            download_url: None,
78            assembly_report_url: None,
79            contigs: Vec::new(),
80            description: None,
81            tags: Vec::new(),
82            contigs_missing_from_fasta: Vec::new(),
83            md5_set: HashSet::new(),
84            name_length_set: HashSet::new(),
85            signature: None,
86        }
87    }
88
89    #[must_use]
90    pub fn with_contigs(mut self, contigs: Vec<Contig>) -> Self {
91        self.naming_convention = detect_naming_convention(&contigs);
92        self.contigs = contigs;
93        self.rebuild_indexes();
94        self
95    }
96
97    /// Rebuild the internal indexes after modifying contigs
98    pub fn rebuild_indexes(&mut self) {
99        self.md5_set.clear();
100        self.name_length_set.clear();
101
102        for contig in &self.contigs {
103            if let Some(md5) = &contig.md5 {
104                self.md5_set.insert(md5.clone());
105            }
106            // Use exact name for matching (no normalization)
107            self.name_length_set
108                .insert((contig.name.clone(), contig.length));
109
110            // Also add aliases to name_length_set for matching
111            for alias in &contig.aliases {
112                self.name_length_set.insert((alias.clone(), contig.length));
113            }
114        }
115
116        // Compute signature from sorted MD5s
117        self.signature = self.compute_signature();
118    }
119
120    /// Compute a signature for exact matching
121    /// Uses sorted MD5s concatenated and hashed
122    fn compute_signature(&self) -> Option<String> {
123        let sig = compute_sig(&self.md5_set);
124        if sig.is_empty() {
125            None
126        } else {
127            Some(sig)
128        }
129    }
130
131    /// Check if this reference has decoy sequences
132    #[must_use]
133    pub fn has_decoy(&self) -> bool {
134        self.contigs.iter().any(super::contig::Contig::is_decoy)
135    }
136
137    /// Check if this reference has ALT contigs
138    #[must_use]
139    pub fn has_alt(&self) -> bool {
140        self.contigs.iter().any(super::contig::Contig::is_alt)
141    }
142
143    /// Count contigs by sequence role
144    #[must_use]
145    pub fn role_counts(&self) -> RoleCounts {
146        let mut counts = RoleCounts::default();
147        for contig in &self.contigs {
148            match contig.sequence_role {
149                SequenceRole::AssembledMolecule => counts.assembled_molecule += 1,
150                SequenceRole::AltScaffold => counts.alt_scaffold += 1,
151                SequenceRole::FixPatch => counts.fix_patch += 1,
152                SequenceRole::NovelPatch => counts.novel_patch += 1,
153                SequenceRole::UnlocalizedScaffold => counts.unlocalized_scaffold += 1,
154                SequenceRole::UnplacedScaffold => counts.unplaced_scaffold += 1,
155                SequenceRole::Unknown => counts.unknown += 1,
156            }
157        }
158        counts
159    }
160}
161
162/// Counts of contigs by sequence role
163#[derive(Debug, Clone, Default, Serialize, Deserialize)]
164pub struct RoleCounts {
165    pub assembled_molecule: usize,
166    pub alt_scaffold: usize,
167    pub fix_patch: usize,
168    pub novel_patch: usize,
169    pub unlocalized_scaffold: usize,
170    pub unplaced_scaffold: usize,
171    pub unknown: usize,
172}