Skip to main content

ref_solver/catalog/
hierarchical.rs

1//! Hierarchical catalog structure for assemblies and distributions.
2//!
3//! This module provides a hierarchical catalog structure:
4//! `HierarchicalCatalog` → `HierarchicalAssembly` → `AssemblyVersion` → `FastaDistribution`
5//!
6//! This structure allows:
7//! - Clear relationships between assemblies and their versions
8//! - Multiple FASTA distributions per assembly version
9//! - Per-contig presence tracking (in report, in FASTA, or both)
10//! - Fast lookup by MD5 or name+length
11
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet};
14use std::path::Path;
15
16use crate::core::assembly::{
17    AssemblyVersion, FastaContig, FastaDistribution, HierarchicalAssembly,
18};
19
20/// Helper function to convert usize count to f64 with explicit precision loss allowance
21#[inline]
22fn count_to_f64(count: usize) -> f64 {
23    #[allow(clippy::cast_precision_loss)]
24    {
25        count as f64
26    }
27}
28
29/// Hierarchical catalog with assemblies, versions, and distributions
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct HierarchicalCatalog {
32    /// Catalog schema version
33    pub version: String,
34    /// Creation timestamp
35    pub created_at: String,
36    /// All assemblies in the catalog
37    pub assemblies: Vec<HierarchicalAssembly>,
38    /// Standalone distributions without assembly reports
39    #[serde(default, skip_serializing_if = "Vec::is_empty")]
40    pub standalone_distributions: Vec<FastaDistribution>,
41}
42
43impl Default for HierarchicalCatalog {
44    fn default() -> Self {
45        Self::new()
46    }
47}
48
49impl HierarchicalCatalog {
50    #[must_use]
51    pub fn new() -> Self {
52        Self {
53            version: "1.0.0".to_string(),
54            created_at: chrono::Utc::now().to_rfc3339(),
55            assemblies: Vec::new(),
56            standalone_distributions: Vec::new(),
57        }
58    }
59
60    /// Add a standalone distribution (no assembly report)
61    #[must_use]
62    pub fn with_standalone_distribution(mut self, dist: FastaDistribution) -> Self {
63        self.standalone_distributions.push(dist);
64        self
65    }
66
67    /// Build indexes for fast matching
68    #[must_use]
69    pub fn build_index(&self) -> CatalogIndex {
70        let mut index = CatalogIndex::default();
71
72        for assembly in &self.assemblies {
73            for version in &assembly.versions {
74                for dist in &version.fasta_distributions {
75                    for (i, contig) in dist.contigs.iter().enumerate() {
76                        let location = ContigLocation {
77                            assembly_id: assembly.id.clone(),
78                            version_id: version.id.clone(),
79                            distribution_id: dist.id.clone(),
80                            contig_index: i,
81                        };
82
83                        // Index by MD5
84                        if !contig.md5.is_empty() {
85                            index
86                                .by_md5
87                                .entry(contig.md5.clone())
88                                .or_default()
89                                .push(location.clone());
90                        }
91
92                        // Index by name+length
93                        index
94                            .by_name_length
95                            .entry((contig.name.clone(), contig.length))
96                            .or_default()
97                            .push(location);
98                    }
99                }
100            }
101        }
102
103        // Also index standalone distributions
104        for dist in &self.standalone_distributions {
105            for (i, contig) in dist.contigs.iter().enumerate() {
106                let location = ContigLocation {
107                    assembly_id: String::new(),
108                    version_id: String::new(),
109                    distribution_id: dist.id.clone(),
110                    contig_index: i,
111                };
112
113                if !contig.md5.is_empty() {
114                    index
115                        .by_md5
116                        .entry(contig.md5.clone())
117                        .or_default()
118                        .push(location.clone());
119                }
120
121                index
122                    .by_name_length
123                    .entry((contig.name.clone(), contig.length))
124                    .or_default()
125                    .push(location);
126            }
127        }
128
129        index
130    }
131
132    /// Get a distribution by ID
133    #[must_use]
134    pub fn get_distribution(&self, id: &str) -> Option<DistributionRef<'_>> {
135        for assembly in &self.assemblies {
136            for version in &assembly.versions {
137                for dist in &version.fasta_distributions {
138                    if dist.id == id {
139                        return Some(DistributionRef {
140                            assembly: Some(assembly),
141                            version: Some(version),
142                            distribution: dist,
143                        });
144                    }
145                }
146            }
147        }
148
149        for dist in &self.standalone_distributions {
150            if dist.id == id {
151                return Some(DistributionRef {
152                    assembly: None,
153                    version: None,
154                    distribution: dist,
155                });
156            }
157        }
158
159        None
160    }
161
162    /// Load from JSON file
163    ///
164    /// # Errors
165    ///
166    /// Returns an error if the file cannot be read or contains invalid JSON.
167    pub fn load(path: &Path) -> Result<Self, std::io::Error> {
168        let file = std::fs::File::open(path)?;
169        let reader = std::io::BufReader::new(file);
170        serde_json::from_reader(reader)
171            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
172    }
173
174    /// Save to JSON file
175    ///
176    /// # Errors
177    ///
178    /// Returns an error if the file cannot be created or written.
179    pub fn save(&self, path: &Path) -> Result<(), std::io::Error> {
180        let file = std::fs::File::create(path)?;
181        let writer = std::io::BufWriter::new(file);
182        serde_json::to_writer_pretty(writer, self)
183            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
184    }
185
186    /// Infer the base assembly version for a set of contigs by MD5 matching.
187    ///
188    /// This method compares MD5 checksums from the input contigs against all known
189    /// distributions in the catalog. Returns the best matching assembly version
190    /// if the match rate exceeds the threshold.
191    ///
192    /// # Arguments
193    /// * `contigs` - The contigs to match (must have MD5 checksums)
194    /// * `min_match_rate` - Minimum fraction of contigs that must match (0.0 - 1.0)
195    ///
196    /// # Returns
197    /// * `Some(InferredAssembly)` if a match is found above the threshold
198    /// * `None` if no assembly matches well enough
199    #[must_use]
200    pub fn infer_base_assembly(
201        &self,
202        contigs: &[FastaContig],
203        min_match_rate: f64,
204    ) -> Option<InferredAssembly> {
205        // Collect MD5s from input contigs
206        let input_md5s: HashSet<&str> = contigs
207            .iter()
208            .filter(|c| !c.md5.is_empty())
209            .map(|c| c.md5.as_str())
210            .collect();
211
212        if input_md5s.is_empty() {
213            return None;
214        }
215
216        let mut best_match: Option<InferredAssembly> = None;
217
218        // Check each assembly version's distributions
219        for assembly in &self.assemblies {
220            for version in &assembly.versions {
221                // Collect all unique MD5s from this version's distributions
222                let mut version_md5s: HashSet<&str> = HashSet::new();
223                for dist in &version.fasta_distributions {
224                    for contig in &dist.contigs {
225                        if !contig.md5.is_empty() {
226                            version_md5s.insert(&contig.md5);
227                        }
228                    }
229                }
230
231                if version_md5s.is_empty() {
232                    continue;
233                }
234
235                // Count matches
236                let matches = input_md5s.intersection(&version_md5s).count();
237                let match_rate = count_to_f64(matches) / count_to_f64(input_md5s.len());
238
239                if match_rate >= min_match_rate {
240                    let is_better = best_match
241                        .as_ref()
242                        .map_or(true, |b| match_rate > b.match_rate);
243
244                    if is_better {
245                        best_match = Some(InferredAssembly {
246                            assembly_id: assembly.id.clone(),
247                            assembly_name: assembly.name.clone(),
248                            version_id: version.id.clone(),
249                            version_string: version.version.clone(),
250                            match_rate,
251                            matched_contigs: matches,
252                            total_input_contigs: input_md5s.len(),
253                        });
254                    }
255                }
256            }
257        }
258
259        best_match
260    }
261
262    /// Infer base assembly with default threshold (90%)
263    #[must_use]
264    pub fn infer_base_assembly_default(&self, contigs: &[FastaContig]) -> Option<InferredAssembly> {
265        self.infer_base_assembly(contigs, 0.9)
266    }
267}
268
269/// Result of inferring the base assembly for a set of contigs
270#[derive(Debug, Clone)]
271pub struct InferredAssembly {
272    /// Assembly ID (e.g., "grch38")
273    pub assembly_id: String,
274    /// Assembly name (e.g., "`GRCh38`")
275    pub assembly_name: String,
276    /// Version ID (e.g., "`grch38_p14`")
277    pub version_id: String,
278    /// Version string (e.g., "p14")
279    pub version_string: String,
280    /// Fraction of input contigs that matched
281    pub match_rate: f64,
282    /// Number of contigs that matched
283    pub matched_contigs: usize,
284    /// Total input contigs with MD5
285    pub total_input_contigs: usize,
286}
287
288/// Reference to a distribution with its context
289#[derive(Debug, Clone)]
290pub struct DistributionRef<'a> {
291    pub assembly: Option<&'a HierarchicalAssembly>,
292    pub version: Option<&'a AssemblyVersion>,
293    pub distribution: &'a FastaDistribution,
294}
295
296/// Indexes for fast lookup
297#[derive(Debug, Default)]
298pub struct CatalogIndex {
299    /// MD5 -> locations
300    pub by_md5: HashMap<String, Vec<ContigLocation>>,
301    /// (name, length) -> locations
302    pub by_name_length: HashMap<(String, u64), Vec<ContigLocation>>,
303}
304
305impl CatalogIndex {
306    /// Find all locations for a given MD5
307    #[must_use]
308    pub fn find_by_md5(&self, md5: &str) -> &[ContigLocation] {
309        self.by_md5.get(md5).map_or(&[], std::vec::Vec::as_slice)
310    }
311
312    /// Find all locations for a given name+length
313    #[must_use]
314    pub fn find_by_name_length(&self, name: &str, length: u64) -> &[ContigLocation] {
315        self.by_name_length
316            .get(&(name.to_string(), length))
317            .map_or(&[], std::vec::Vec::as_slice)
318    }
319}
320
321/// Location of a contig in the catalog
322#[derive(Debug, Clone, PartialEq)]
323pub struct ContigLocation {
324    /// Assembly ID (empty for standalone distributions)
325    pub assembly_id: String,
326    /// Version ID (empty for standalone distributions)
327    pub version_id: String,
328    /// Distribution ID
329    pub distribution_id: String,
330    /// Index in distribution's contig list
331    pub contig_index: usize,
332}
333
334#[cfg(test)]
335mod tests {
336    use super::*;
337    use crate::core::assembly::{FastaContig, ReportSource};
338    use crate::core::types::ReferenceSource;
339
340    fn make_test_catalog() -> HierarchicalCatalog {
341        HierarchicalCatalog {
342            version: "1.0.0".to_string(),
343            created_at: "2024-01-01T00:00:00Z".to_string(),
344            assemblies: vec![HierarchicalAssembly {
345                id: "grch38".to_string(),
346                name: "GRCh38".to_string(),
347                organism: "Homo sapiens".to_string(),
348                versions: vec![AssemblyVersion {
349                    id: "grch38_p14".to_string(),
350                    version: "p14".to_string(),
351                    source: ReportSource::Ncbi {
352                        accession: "GCF_000001405.40".to_string(),
353                        url: None,
354                        date: None,
355                    },
356                    report_contigs: vec![],
357                    fasta_distributions: vec![
358                        FastaDistribution {
359                            id: "hg38_ucsc".to_string(),
360                            display_name: "hg38 (UCSC)".to_string(),
361                            source: ReferenceSource::Ucsc,
362                            download_url: None,
363                            tags: vec![],
364                            contigs: vec![FastaContig {
365                                name: "chr1".to_string(),
366                                length: 248_956_422,
367                                md5: "6aef897c3d6ff0c78aff06ac189178dd".to_string(),
368                                sort_order: 0,
369                                report_contig_id: Some(1),
370                                aliases: vec![],
371                            }],
372                        },
373                        FastaDistribution {
374                            id: "hs38DH".to_string(),
375                            display_name: "hs38DH (1KG)".to_string(),
376                            source: ReferenceSource::OneThousandGenomes,
377                            download_url: None,
378                            tags: vec![],
379                            contigs: vec![FastaContig {
380                                name: "chr1".to_string(),
381                                length: 248_956_422,
382                                md5: "6aef897c3d6ff0c78aff06ac189178dd".to_string(),
383                                sort_order: 0,
384                                report_contig_id: Some(1),
385                                aliases: vec![],
386                            }],
387                        },
388                    ],
389                }],
390            }],
391            standalone_distributions: vec![],
392        }
393    }
394
395    #[test]
396    fn test_index_by_md5() {
397        let catalog = make_test_catalog();
398        let index = catalog.build_index();
399
400        let locations = index.find_by_md5("6aef897c3d6ff0c78aff06ac189178dd");
401
402        // chr1 appears in both hg38_ucsc and hs38DH
403        assert_eq!(locations.len(), 2);
404        assert!(locations.iter().any(|l| l.distribution_id == "hg38_ucsc"));
405        assert!(locations.iter().any(|l| l.distribution_id == "hs38DH"));
406    }
407
408    #[test]
409    fn test_index_by_name_length() {
410        let catalog = make_test_catalog();
411        let index = catalog.build_index();
412
413        let locations = index.find_by_name_length("chr1", 248_956_422);
414
415        assert_eq!(locations.len(), 2);
416    }
417
418    #[test]
419    fn test_get_distribution() {
420        let catalog = make_test_catalog();
421
422        let dist_ref = catalog.get_distribution("hg38_ucsc").unwrap();
423        assert_eq!(dist_ref.distribution.id, "hg38_ucsc");
424        assert!(dist_ref.assembly.is_some());
425        assert!(dist_ref.version.is_some());
426    }
427
428    #[test]
429    fn test_save_and_load_roundtrip() {
430        let catalog = make_test_catalog();
431        let temp = tempfile::NamedTempFile::with_suffix(".json").unwrap();
432
433        catalog.save(temp.path()).unwrap();
434        let loaded = HierarchicalCatalog::load(temp.path()).unwrap();
435
436        assert_eq!(catalog.version, loaded.version);
437        assert_eq!(catalog.assemblies.len(), loaded.assemblies.len());
438        assert_eq!(
439            catalog.assemblies[0].versions[0].fasta_distributions.len(),
440            loaded.assemblies[0].versions[0].fasta_distributions.len()
441        );
442    }
443
444    #[test]
445    fn test_standalone_distribution() {
446        let mut catalog = HierarchicalCatalog::new();
447        catalog.standalone_distributions.push(FastaDistribution {
448            id: "custom_ref".to_string(),
449            display_name: "Custom Reference".to_string(),
450            source: ReferenceSource::Custom("Local".to_string()),
451            download_url: None,
452            tags: vec![],
453            contigs: vec![FastaContig::new("contig1", 1000, "abc123")],
454        });
455
456        let index = catalog.build_index();
457        let locations = index.find_by_md5("abc123");
458
459        assert_eq!(locations.len(), 1);
460        assert!(locations[0].assembly_id.is_empty());
461        assert_eq!(locations[0].distribution_id, "custom_ref");
462    }
463
464    fn make_catalog_for_inference() -> HierarchicalCatalog {
465        HierarchicalCatalog {
466            version: "1.0.0".to_string(),
467            created_at: "2024-01-01T00:00:00Z".to_string(),
468            assemblies: vec![
469                HierarchicalAssembly {
470                    id: "grch38".to_string(),
471                    name: "GRCh38".to_string(),
472                    organism: "Homo sapiens".to_string(),
473                    versions: vec![AssemblyVersion {
474                        id: "grch38_p14".to_string(),
475                        version: "p14".to_string(),
476                        source: ReportSource::Ncbi {
477                            accession: "GCF_000001405.40".to_string(),
478                            url: None,
479                            date: None,
480                        },
481                        report_contigs: vec![],
482                        fasta_distributions: vec![FastaDistribution {
483                            id: "hg38_ucsc".to_string(),
484                            display_name: "hg38 (UCSC)".to_string(),
485                            source: ReferenceSource::Ucsc,
486                            download_url: None,
487                            tags: vec![],
488                            contigs: vec![
489                                FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
490                                FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
491                                FastaContig::new("chr3", 198_295_559, "md5_grch38_chr3"),
492                            ],
493                        }],
494                    }],
495                },
496                HierarchicalAssembly {
497                    id: "grch37".to_string(),
498                    name: "GRCh37".to_string(),
499                    organism: "Homo sapiens".to_string(),
500                    versions: vec![AssemblyVersion {
501                        id: "grch37_p13".to_string(),
502                        version: "p13".to_string(),
503                        source: ReportSource::Ncbi {
504                            accession: "GCF_000001405.25".to_string(),
505                            url: None,
506                            date: None,
507                        },
508                        report_contigs: vec![],
509                        fasta_distributions: vec![FastaDistribution {
510                            id: "hg19_ucsc".to_string(),
511                            display_name: "hg19 (UCSC)".to_string(),
512                            source: ReferenceSource::Ucsc,
513                            download_url: None,
514                            tags: vec![],
515                            contigs: vec![
516                                FastaContig::new("chr1", 249_250_621, "md5_grch37_chr1"),
517                                FastaContig::new("chr2", 243_199_373, "md5_grch37_chr2"),
518                            ],
519                        }],
520                    }],
521                },
522            ],
523            standalone_distributions: vec![],
524        }
525    }
526
527    #[test]
528    fn test_infer_base_assembly_exact_match() {
529        let catalog = make_catalog_for_inference();
530
531        // Input with all GRCh38 MD5s
532        let input_contigs = vec![
533            FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
534            FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
535            FastaContig::new("chr3", 198_295_559, "md5_grch38_chr3"),
536        ];
537
538        let inferred = catalog.infer_base_assembly(&input_contigs, 0.9).unwrap();
539
540        assert_eq!(inferred.assembly_id, "grch38");
541        assert_eq!(inferred.version_id, "grch38_p14");
542        assert_eq!(inferred.matched_contigs, 3);
543        assert!((inferred.match_rate - 1.0).abs() < 0.001);
544    }
545
546    #[test]
547    fn test_infer_base_assembly_partial_match() {
548        let catalog = make_catalog_for_inference();
549
550        // Input with 2 out of 3 GRCh38 MD5s - but match_rate is based on input contigs
551        // So 2/2 = 100% of INPUT matched, even though only 2/3 of catalog matched
552        let input_contigs = vec![
553            FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
554            FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
555        ];
556
557        // With 90% threshold, should match since 100% of input matches
558        let inferred = catalog.infer_base_assembly(&input_contigs, 0.9).unwrap();
559        assert_eq!(inferred.assembly_id, "grch38");
560        assert!((inferred.match_rate - 1.0).abs() < 0.001); // 100% of input matched
561
562        // Now test with input that has non-matching contigs
563        let input_with_extra = vec![
564            FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
565            FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
566            FastaContig::new("extra_contig", 5000, "unknown_md5"),
567        ];
568
569        // 2/3 = 66.7% of input matched, below 90% threshold
570        let inferred = catalog.infer_base_assembly(&input_with_extra, 0.9);
571        assert!(inferred.is_none());
572
573        // With 50% threshold, should match
574        let inferred = catalog.infer_base_assembly(&input_with_extra, 0.5).unwrap();
575        assert_eq!(inferred.assembly_id, "grch38");
576        assert!((inferred.match_rate - 0.6667).abs() < 0.01); // ~66.7% matched
577    }
578
579    #[test]
580    fn test_infer_base_assembly_no_match() {
581        let catalog = make_catalog_for_inference();
582
583        // Input with completely different MD5s
584        let input_contigs = vec![
585            FastaContig::new("chr1", 100_000, "unknown_md5_1"),
586            FastaContig::new("chr2", 200_000, "unknown_md5_2"),
587        ];
588
589        let inferred = catalog.infer_base_assembly(&input_contigs, 0.5);
590        assert!(inferred.is_none());
591    }
592
593    #[test]
594    fn test_infer_base_assembly_empty_input() {
595        let catalog = make_catalog_for_inference();
596
597        // Input with no MD5s
598        let input_contigs = vec![
599            FastaContig::new("chr1", 100_000, ""),
600            FastaContig::new("chr2", 200_000, ""),
601        ];
602
603        let inferred = catalog.infer_base_assembly(&input_contigs, 0.5);
604        assert!(inferred.is_none());
605    }
606
607    #[test]
608    fn test_infer_base_assembly_distinguishes_versions() {
609        let catalog = make_catalog_for_inference();
610
611        // Input with GRCh37 MD5s
612        let input_contigs = vec![
613            FastaContig::new("chr1", 249_250_621, "md5_grch37_chr1"),
614            FastaContig::new("chr2", 243_199_373, "md5_grch37_chr2"),
615        ];
616
617        let inferred = catalog.infer_base_assembly(&input_contigs, 0.9).unwrap();
618
619        assert_eq!(inferred.assembly_id, "grch37");
620        assert_eq!(inferred.version_id, "grch37_p13");
621        assert_eq!(inferred.assembly_name, "GRCh37");
622    }
623
624    #[test]
625    fn test_infer_base_assembly_default_threshold() {
626        let catalog = make_catalog_for_inference();
627
628        let input_contigs = vec![
629            FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
630            FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
631            FastaContig::new("chr3", 198_295_559, "md5_grch38_chr3"),
632        ];
633
634        let inferred = catalog.infer_base_assembly_default(&input_contigs).unwrap();
635        assert_eq!(inferred.assembly_id, "grch38");
636    }
637}