Skip to main content

ref_solver/core/
assembly.rs

1//! Hierarchical assembly types for the catalog.
2//!
3//! This module provides types for representing the hierarchy:
4//! Assembly → `AssemblyVersion` → `FastaDistribution` → `FastaContig`
5//!
6//! This allows tracking:
7//! - Multiple FASTA distributions per assembly version (e.g., UCSC hg38 vs 1KG hs38DH)
8//! - Per-contig presence tracking (in report, in FASTA, or both)
9//! - Report provenance (official NCBI vs derived from FASTA)
10
11use serde::{Deserialize, Serialize};
12use std::collections::HashSet;
13
14use crate::core::contig::SequenceRole;
15use crate::core::types::ReferenceSource;
16
17/// Top-level assembly (e.g., `GRCh38`, `GRCh37`, CHM13)
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
19pub struct HierarchicalAssembly {
20    /// Unique identifier (e.g., "grch38")
21    pub id: String,
22    /// Display name (e.g., "`GRCh38`")
23    pub name: String,
24    /// Organism (e.g., "Homo sapiens")
25    pub organism: String,
26    /// All versions/patches of this assembly
27    pub versions: Vec<AssemblyVersion>,
28}
29
30/// A specific version/patch of an assembly (has one assembly report)
31#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
32pub struct AssemblyVersion {
33    /// Unique identifier (e.g., "`grch38_p14`")
34    pub id: String,
35    /// Version string (e.g., "p14")
36    pub version: String,
37    /// Source/provenance of the assembly report
38    pub source: ReportSource,
39    /// Canonical contigs from assembly report
40    #[serde(default, skip_serializing_if = "Vec::is_empty")]
41    pub report_contigs: Vec<ReportContig>,
42    /// FASTA distributions for this version
43    #[serde(default, skip_serializing_if = "Vec::is_empty")]
44    pub fasta_distributions: Vec<FastaDistribution>,
45}
46
47/// Provenance tracking for assembly reports
48#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
49#[serde(tag = "type", rename_all = "snake_case")]
50pub enum ReportSource {
51    /// Official NCBI assembly report
52    Ncbi {
53        /// NCBI accession (e.g., "`GCF_000001405.40`")
54        accession: String,
55        /// Download URL
56        #[serde(default, skip_serializing_if = "Option::is_none")]
57        url: Option<String>,
58        /// Report date
59        #[serde(default, skip_serializing_if = "Option::is_none")]
60        date: Option<String>,
61    },
62    /// Derived from FASTA/dict file (no official report)
63    DerivedFromFasta {
64        /// Source files used to build this
65        source_files: Vec<String>,
66        /// Inferred base assembly (e.g., "`grch38_p14`")
67        #[serde(default, skip_serializing_if = "Option::is_none")]
68        base_assembly: Option<String>,
69    },
70    /// Manually curated
71    Manual {
72        #[serde(default, skip_serializing_if = "Option::is_none")]
73        curator: Option<String>,
74        #[serde(default, skip_serializing_if = "Option::is_none")]
75        notes: Option<String>,
76    },
77}
78
79/// A contig from an assembly report (canonical definition)
80#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
81pub struct ReportContig {
82    /// Internal ID for linking to `FastaContig`
83    pub id: u32,
84    /// Primary sequence name
85    pub sequence_name: String,
86    /// Sequence length
87    pub length: u64,
88    /// MD5 checksum if known
89    #[serde(default, skip_serializing_if = "Option::is_none")]
90    pub md5: Option<String>,
91    /// `RefSeq` accession (e.g., "`NC_000001.11`")
92    #[serde(default, skip_serializing_if = "Option::is_none")]
93    pub refseq_accn: Option<String>,
94    /// `GenBank` accession (e.g., "CM000663.2")
95    #[serde(default, skip_serializing_if = "Option::is_none")]
96    pub genbank_accn: Option<String>,
97    /// UCSC-style name (e.g., "chr1")
98    #[serde(default, skip_serializing_if = "Option::is_none")]
99    pub ucsc_name: Option<String>,
100    /// Sequence role
101    #[serde(default)]
102    pub sequence_role: SequenceRole,
103    /// Assigned molecule (e.g., "1", "X", "MT")
104    #[serde(default, skip_serializing_if = "Option::is_none")]
105    pub assigned_molecule: Option<String>,
106}
107
108/// A FASTA distribution (specific file with contigs)
109#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
110pub struct FastaDistribution {
111    /// Unique identifier (e.g., "hs38DH")
112    pub id: String,
113    /// Display name (e.g., "hs38DH (1KG)")
114    pub display_name: String,
115    /// Source organization
116    pub source: ReferenceSource,
117    /// Download URL
118    #[serde(default, skip_serializing_if = "Option::is_none")]
119    pub download_url: Option<String>,
120    /// Tags (e.g., "`analysis_set`", "`with_decoy`")
121    #[serde(default, skip_serializing_if = "Vec::is_empty")]
122    pub tags: Vec<String>,
123    /// Contigs in this FASTA
124    pub contigs: Vec<FastaContig>,
125}
126
127impl FastaDistribution {
128    /// Count contigs by presence type
129    #[must_use]
130    pub fn presence_counts(&self) -> PresenceCounts {
131        let mut counts = PresenceCounts::default();
132        for contig in &self.contigs {
133            if contig.report_contig_id.is_some() {
134                counts.in_both += 1;
135            } else {
136                counts.fasta_only += 1;
137            }
138        }
139        counts
140    }
141}
142
143/// A contig in a FASTA distribution
144#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
145pub struct FastaContig {
146    /// Contig name as it appears in the FASTA
147    pub name: String,
148    /// Sequence length
149    pub length: u64,
150    /// MD5 checksum (required for matching)
151    pub md5: String,
152    /// Sort order (position in original file)
153    pub sort_order: u32,
154    /// Link to `ReportContig` (None for decoy/HLA not in assembly report)
155    #[serde(default, skip_serializing_if = "Option::is_none")]
156    pub report_contig_id: Option<u32>,
157    /// Alternative names
158    #[serde(default, skip_serializing_if = "Vec::is_empty")]
159    pub aliases: Vec<String>,
160}
161
162impl FastaContig {
163    #[cfg(test)]
164    pub fn new(name: impl Into<String>, length: u64, md5: impl Into<String>) -> Self {
165        Self {
166            name: name.into(),
167            length,
168            md5: md5.into(),
169            sort_order: 0,
170            report_contig_id: None,
171            aliases: Vec::new(),
172        }
173    }
174
175    /// Merge another contig's metadata into this one
176    ///
177    /// Used when building from multiple input files (e.g., dict + NCBI report)
178    ///
179    /// # Errors
180    ///
181    /// Returns an error if:
182    /// - Names don't match (`ContigMergeError::NameMismatch`)
183    /// - Lengths don't match (`ContigMergeError::LengthMismatch`)
184    /// - Both contigs have different non-empty MD5 checksums (`ContigMergeError::Md5Conflict`)
185    pub fn merge(&mut self, other: &FastaContig) -> Result<(), ContigMergeError> {
186        // Validate name and length match
187        if self.name != other.name {
188            return Err(ContigMergeError::NameMismatch {
189                expected: self.name.clone(),
190                found: other.name.clone(),
191            });
192        }
193        if self.length != other.length {
194            return Err(ContigMergeError::LengthMismatch {
195                name: self.name.clone(),
196                expected: self.length,
197                found: other.length,
198            });
199        }
200
201        // MD5: take first non-empty, error if conflict
202        if !other.md5.is_empty() {
203            if self.md5.is_empty() {
204                self.md5.clone_from(&other.md5);
205            } else if self.md5 != other.md5 {
206                return Err(ContigMergeError::Md5Conflict {
207                    name: self.name.clone(),
208                    existing: self.md5.clone(),
209                    incoming: other.md5.clone(),
210                });
211            }
212        }
213
214        // Aliases: union (excluding the contig name itself)
215        let existing: HashSet<_> = self.aliases.iter().cloned().collect();
216        for alias in &other.aliases {
217            if !existing.contains(alias) && alias != &self.name {
218                self.aliases.push(alias.clone());
219            }
220        }
221
222        // report_contig_id: take first non-null
223        if self.report_contig_id.is_none() {
224            self.report_contig_id = other.report_contig_id;
225        }
226
227        Ok(())
228    }
229}
230
231/// Counts of contigs by presence type
232#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
233pub struct PresenceCounts {
234    /// Contigs in both report and FASTA
235    pub in_both: usize,
236    /// Contigs only in assembly report
237    pub report_only: usize,
238    /// Contigs only in FASTA (decoy, HLA, etc.)
239    pub fasta_only: usize,
240}
241
242/// Error when merging contig metadata
243#[derive(Debug, Clone, PartialEq)]
244pub enum ContigMergeError {
245    NameMismatch {
246        expected: String,
247        found: String,
248    },
249    LengthMismatch {
250        name: String,
251        expected: u64,
252        found: u64,
253    },
254    Md5Conflict {
255        name: String,
256        existing: String,
257        incoming: String,
258    },
259}
260
261impl std::fmt::Display for ContigMergeError {
262    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
263        match self {
264            Self::NameMismatch { expected, found } => {
265                write!(
266                    f,
267                    "Contig name mismatch: expected '{expected}', found '{found}'"
268                )
269            }
270            Self::LengthMismatch {
271                name,
272                expected,
273                found,
274            } => {
275                write!(
276                    f,
277                    "Length mismatch for '{name}': expected {expected}, found {found}"
278                )
279            }
280            Self::Md5Conflict {
281                name,
282                existing,
283                incoming,
284            } => {
285                write!(
286                    f,
287                    "MD5 conflict for '{name}': existing={existing}, incoming={incoming}"
288                )
289            }
290        }
291    }
292}
293
294impl std::error::Error for ContigMergeError {}
295
296#[cfg(test)]
297mod tests {
298    use super::*;
299
300    fn make_contig(name: &str, length: u64, md5: &str, aliases: Vec<&str>) -> FastaContig {
301        FastaContig {
302            name: name.to_string(),
303            length,
304            md5: md5.to_string(),
305            sort_order: 0,
306            report_contig_id: None,
307            aliases: aliases.into_iter().map(String::from).collect(),
308        }
309    }
310
311    #[test]
312    fn test_merge_adds_md5_when_missing() {
313        let mut base = make_contig("chr1", 1000, "", vec![]);
314        let other = make_contig("chr1", 1000, "abc123", vec![]);
315
316        base.merge(&other).unwrap();
317
318        assert_eq!(base.md5, "abc123");
319    }
320
321    #[test]
322    fn test_merge_keeps_existing_md5() {
323        let mut base = make_contig("chr1", 1000, "abc123", vec![]);
324        let other = make_contig("chr1", 1000, "", vec![]);
325
326        base.merge(&other).unwrap();
327
328        assert_eq!(base.md5, "abc123");
329    }
330
331    #[test]
332    fn test_merge_md5_conflict_errors() {
333        let mut base = make_contig("chr1", 1000, "abc123", vec![]);
334        let other = make_contig("chr1", 1000, "def456", vec![]);
335
336        let result = base.merge(&other);
337
338        assert!(matches!(result, Err(ContigMergeError::Md5Conflict { .. })));
339    }
340
341    #[test]
342    fn test_merge_md5_same_value_ok() {
343        let mut base = make_contig("chr1", 1000, "abc123", vec![]);
344        let other = make_contig("chr1", 1000, "abc123", vec![]);
345
346        base.merge(&other).unwrap();
347
348        assert_eq!(base.md5, "abc123");
349    }
350
351    #[test]
352    fn test_merge_unions_aliases() {
353        let mut base = make_contig("chr1", 1000, "", vec!["1", "NC_000001"]);
354        let other = make_contig("chr1", 1000, "", vec!["NC_000001", "CM000663"]);
355
356        base.merge(&other).unwrap();
357
358        assert_eq!(base.aliases.len(), 3);
359        assert!(base.aliases.contains(&"1".to_string()));
360        assert!(base.aliases.contains(&"NC_000001".to_string()));
361        assert!(base.aliases.contains(&"CM000663".to_string()));
362    }
363
364    #[test]
365    fn test_merge_excludes_name_from_aliases() {
366        let mut base = make_contig("chr1", 1000, "", vec![]);
367        let other = make_contig("chr1", 1000, "", vec!["chr1", "1"]);
368
369        base.merge(&other).unwrap();
370
371        assert_eq!(base.aliases, vec!["1"]);
372        assert!(!base.aliases.contains(&"chr1".to_string()));
373    }
374
375    #[test]
376    fn test_merge_length_mismatch_errors() {
377        let mut base = make_contig("chr1", 1000, "", vec![]);
378        let other = make_contig("chr1", 2000, "", vec![]);
379
380        let result = base.merge(&other);
381
382        assert!(matches!(
383            result,
384            Err(ContigMergeError::LengthMismatch { .. })
385        ));
386    }
387
388    #[test]
389    fn test_merge_name_mismatch_errors() {
390        let mut base = make_contig("chr1", 1000, "", vec![]);
391        let other = make_contig("chr2", 1000, "", vec![]);
392
393        let result = base.merge(&other);
394
395        assert!(matches!(result, Err(ContigMergeError::NameMismatch { .. })));
396    }
397
398    #[test]
399    fn test_merge_takes_first_report_contig_id() {
400        let mut base = make_contig("chr1", 1000, "", vec![]);
401        base.report_contig_id = Some(42);
402
403        let mut other = make_contig("chr1", 1000, "", vec![]);
404        other.report_contig_id = Some(99);
405
406        base.merge(&other).unwrap();
407
408        assert_eq!(base.report_contig_id, Some(42));
409    }
410
411    #[test]
412    fn test_merge_fills_missing_report_contig_id() {
413        let mut base = make_contig("chr1", 1000, "", vec![]);
414
415        let mut other = make_contig("chr1", 1000, "", vec![]);
416        other.report_contig_id = Some(42);
417
418        base.merge(&other).unwrap();
419
420        assert_eq!(base.report_contig_id, Some(42));
421    }
422}