1use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet};
14use std::path::Path;
15
16use crate::core::assembly::{
17 AssemblyVersion, FastaContig, FastaDistribution, HierarchicalAssembly,
18};
19
20#[inline]
22fn count_to_f64(count: usize) -> f64 {
23 #[allow(clippy::cast_precision_loss)]
24 {
25 count as f64
26 }
27}
28
29#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct HierarchicalCatalog {
32 pub version: String,
34 pub created_at: String,
36 pub assemblies: Vec<HierarchicalAssembly>,
38 #[serde(default, skip_serializing_if = "Vec::is_empty")]
40 pub standalone_distributions: Vec<FastaDistribution>,
41}
42
43impl Default for HierarchicalCatalog {
44 fn default() -> Self {
45 Self::new()
46 }
47}
48
49impl HierarchicalCatalog {
50 #[must_use]
51 pub fn new() -> Self {
52 Self {
53 version: "1.0.0".to_string(),
54 created_at: chrono::Utc::now().to_rfc3339(),
55 assemblies: Vec::new(),
56 standalone_distributions: Vec::new(),
57 }
58 }
59
60 #[must_use]
62 pub fn with_standalone_distribution(mut self, dist: FastaDistribution) -> Self {
63 self.standalone_distributions.push(dist);
64 self
65 }
66
67 #[must_use]
69 pub fn build_index(&self) -> CatalogIndex {
70 let mut index = CatalogIndex::default();
71
72 for assembly in &self.assemblies {
73 for version in &assembly.versions {
74 for dist in &version.fasta_distributions {
75 for (i, contig) in dist.contigs.iter().enumerate() {
76 let location = ContigLocation {
77 assembly_id: assembly.id.clone(),
78 version_id: version.id.clone(),
79 distribution_id: dist.id.clone(),
80 contig_index: i,
81 };
82
83 if !contig.md5.is_empty() {
85 index
86 .by_md5
87 .entry(contig.md5.clone())
88 .or_default()
89 .push(location.clone());
90 }
91
92 index
94 .by_name_length
95 .entry((contig.name.clone(), contig.length))
96 .or_default()
97 .push(location);
98 }
99 }
100 }
101 }
102
103 for dist in &self.standalone_distributions {
105 for (i, contig) in dist.contigs.iter().enumerate() {
106 let location = ContigLocation {
107 assembly_id: String::new(),
108 version_id: String::new(),
109 distribution_id: dist.id.clone(),
110 contig_index: i,
111 };
112
113 if !contig.md5.is_empty() {
114 index
115 .by_md5
116 .entry(contig.md5.clone())
117 .or_default()
118 .push(location.clone());
119 }
120
121 index
122 .by_name_length
123 .entry((contig.name.clone(), contig.length))
124 .or_default()
125 .push(location);
126 }
127 }
128
129 index
130 }
131
132 #[must_use]
134 pub fn get_distribution(&self, id: &str) -> Option<DistributionRef<'_>> {
135 for assembly in &self.assemblies {
136 for version in &assembly.versions {
137 for dist in &version.fasta_distributions {
138 if dist.id == id {
139 return Some(DistributionRef {
140 assembly: Some(assembly),
141 version: Some(version),
142 distribution: dist,
143 });
144 }
145 }
146 }
147 }
148
149 for dist in &self.standalone_distributions {
150 if dist.id == id {
151 return Some(DistributionRef {
152 assembly: None,
153 version: None,
154 distribution: dist,
155 });
156 }
157 }
158
159 None
160 }
161
162 pub fn load(path: &Path) -> Result<Self, std::io::Error> {
168 let file = std::fs::File::open(path)?;
169 let reader = std::io::BufReader::new(file);
170 serde_json::from_reader(reader)
171 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
172 }
173
174 pub fn save(&self, path: &Path) -> Result<(), std::io::Error> {
180 let file = std::fs::File::create(path)?;
181 let writer = std::io::BufWriter::new(file);
182 serde_json::to_writer_pretty(writer, self)
183 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
184 }
185
186 #[must_use]
200 pub fn infer_base_assembly(
201 &self,
202 contigs: &[FastaContig],
203 min_match_rate: f64,
204 ) -> Option<InferredAssembly> {
205 let input_md5s: HashSet<&str> = contigs
207 .iter()
208 .filter(|c| !c.md5.is_empty())
209 .map(|c| c.md5.as_str())
210 .collect();
211
212 if input_md5s.is_empty() {
213 return None;
214 }
215
216 let mut best_match: Option<InferredAssembly> = None;
217
218 for assembly in &self.assemblies {
220 for version in &assembly.versions {
221 let mut version_md5s: HashSet<&str> = HashSet::new();
223 for dist in &version.fasta_distributions {
224 for contig in &dist.contigs {
225 if !contig.md5.is_empty() {
226 version_md5s.insert(&contig.md5);
227 }
228 }
229 }
230
231 if version_md5s.is_empty() {
232 continue;
233 }
234
235 let matches = input_md5s.intersection(&version_md5s).count();
237 let match_rate = count_to_f64(matches) / count_to_f64(input_md5s.len());
238
239 if match_rate >= min_match_rate {
240 let is_better = best_match
241 .as_ref()
242 .map_or(true, |b| match_rate > b.match_rate);
243
244 if is_better {
245 best_match = Some(InferredAssembly {
246 assembly_id: assembly.id.clone(),
247 assembly_name: assembly.name.clone(),
248 version_id: version.id.clone(),
249 version_string: version.version.clone(),
250 match_rate,
251 matched_contigs: matches,
252 total_input_contigs: input_md5s.len(),
253 });
254 }
255 }
256 }
257 }
258
259 best_match
260 }
261
262 #[must_use]
264 pub fn infer_base_assembly_default(&self, contigs: &[FastaContig]) -> Option<InferredAssembly> {
265 self.infer_base_assembly(contigs, 0.9)
266 }
267}
268
269#[derive(Debug, Clone)]
271pub struct InferredAssembly {
272 pub assembly_id: String,
274 pub assembly_name: String,
276 pub version_id: String,
278 pub version_string: String,
280 pub match_rate: f64,
282 pub matched_contigs: usize,
284 pub total_input_contigs: usize,
286}
287
288#[derive(Debug, Clone)]
290pub struct DistributionRef<'a> {
291 pub assembly: Option<&'a HierarchicalAssembly>,
292 pub version: Option<&'a AssemblyVersion>,
293 pub distribution: &'a FastaDistribution,
294}
295
296#[derive(Debug, Default)]
298pub struct CatalogIndex {
299 pub by_md5: HashMap<String, Vec<ContigLocation>>,
301 pub by_name_length: HashMap<(String, u64), Vec<ContigLocation>>,
303}
304
305impl CatalogIndex {
306 #[must_use]
308 pub fn find_by_md5(&self, md5: &str) -> &[ContigLocation] {
309 self.by_md5.get(md5).map_or(&[], std::vec::Vec::as_slice)
310 }
311
312 #[must_use]
314 pub fn find_by_name_length(&self, name: &str, length: u64) -> &[ContigLocation] {
315 self.by_name_length
316 .get(&(name.to_string(), length))
317 .map_or(&[], std::vec::Vec::as_slice)
318 }
319}
320
321#[derive(Debug, Clone, PartialEq)]
323pub struct ContigLocation {
324 pub assembly_id: String,
326 pub version_id: String,
328 pub distribution_id: String,
330 pub contig_index: usize,
332}
333
334#[cfg(test)]
335mod tests {
336 use super::*;
337 use crate::core::assembly::{FastaContig, ReportSource};
338 use crate::core::types::ReferenceSource;
339
340 fn make_test_catalog() -> HierarchicalCatalog {
341 HierarchicalCatalog {
342 version: "1.0.0".to_string(),
343 created_at: "2024-01-01T00:00:00Z".to_string(),
344 assemblies: vec![HierarchicalAssembly {
345 id: "grch38".to_string(),
346 name: "GRCh38".to_string(),
347 organism: "Homo sapiens".to_string(),
348 versions: vec![AssemblyVersion {
349 id: "grch38_p14".to_string(),
350 version: "p14".to_string(),
351 source: ReportSource::Ncbi {
352 accession: "GCF_000001405.40".to_string(),
353 url: None,
354 date: None,
355 },
356 report_contigs: vec![],
357 fasta_distributions: vec![
358 FastaDistribution {
359 id: "hg38_ucsc".to_string(),
360 display_name: "hg38 (UCSC)".to_string(),
361 source: ReferenceSource::Ucsc,
362 download_url: None,
363 tags: vec![],
364 contigs: vec![FastaContig {
365 name: "chr1".to_string(),
366 length: 248_956_422,
367 md5: "6aef897c3d6ff0c78aff06ac189178dd".to_string(),
368 sort_order: 0,
369 report_contig_id: Some(1),
370 aliases: vec![],
371 }],
372 },
373 FastaDistribution {
374 id: "hs38DH".to_string(),
375 display_name: "hs38DH (1KG)".to_string(),
376 source: ReferenceSource::OneThousandGenomes,
377 download_url: None,
378 tags: vec![],
379 contigs: vec![FastaContig {
380 name: "chr1".to_string(),
381 length: 248_956_422,
382 md5: "6aef897c3d6ff0c78aff06ac189178dd".to_string(),
383 sort_order: 0,
384 report_contig_id: Some(1),
385 aliases: vec![],
386 }],
387 },
388 ],
389 }],
390 }],
391 standalone_distributions: vec![],
392 }
393 }
394
395 #[test]
396 fn test_index_by_md5() {
397 let catalog = make_test_catalog();
398 let index = catalog.build_index();
399
400 let locations = index.find_by_md5("6aef897c3d6ff0c78aff06ac189178dd");
401
402 assert_eq!(locations.len(), 2);
404 assert!(locations.iter().any(|l| l.distribution_id == "hg38_ucsc"));
405 assert!(locations.iter().any(|l| l.distribution_id == "hs38DH"));
406 }
407
408 #[test]
409 fn test_index_by_name_length() {
410 let catalog = make_test_catalog();
411 let index = catalog.build_index();
412
413 let locations = index.find_by_name_length("chr1", 248_956_422);
414
415 assert_eq!(locations.len(), 2);
416 }
417
418 #[test]
419 fn test_get_distribution() {
420 let catalog = make_test_catalog();
421
422 let dist_ref = catalog.get_distribution("hg38_ucsc").unwrap();
423 assert_eq!(dist_ref.distribution.id, "hg38_ucsc");
424 assert!(dist_ref.assembly.is_some());
425 assert!(dist_ref.version.is_some());
426 }
427
428 #[test]
429 fn test_save_and_load_roundtrip() {
430 let catalog = make_test_catalog();
431 let temp = tempfile::NamedTempFile::with_suffix(".json").unwrap();
432
433 catalog.save(temp.path()).unwrap();
434 let loaded = HierarchicalCatalog::load(temp.path()).unwrap();
435
436 assert_eq!(catalog.version, loaded.version);
437 assert_eq!(catalog.assemblies.len(), loaded.assemblies.len());
438 assert_eq!(
439 catalog.assemblies[0].versions[0].fasta_distributions.len(),
440 loaded.assemblies[0].versions[0].fasta_distributions.len()
441 );
442 }
443
444 #[test]
445 fn test_standalone_distribution() {
446 let mut catalog = HierarchicalCatalog::new();
447 catalog.standalone_distributions.push(FastaDistribution {
448 id: "custom_ref".to_string(),
449 display_name: "Custom Reference".to_string(),
450 source: ReferenceSource::Custom("Local".to_string()),
451 download_url: None,
452 tags: vec![],
453 contigs: vec![FastaContig::new("contig1", 1000, "abc123")],
454 });
455
456 let index = catalog.build_index();
457 let locations = index.find_by_md5("abc123");
458
459 assert_eq!(locations.len(), 1);
460 assert!(locations[0].assembly_id.is_empty());
461 assert_eq!(locations[0].distribution_id, "custom_ref");
462 }
463
464 fn make_catalog_for_inference() -> HierarchicalCatalog {
465 HierarchicalCatalog {
466 version: "1.0.0".to_string(),
467 created_at: "2024-01-01T00:00:00Z".to_string(),
468 assemblies: vec![
469 HierarchicalAssembly {
470 id: "grch38".to_string(),
471 name: "GRCh38".to_string(),
472 organism: "Homo sapiens".to_string(),
473 versions: vec![AssemblyVersion {
474 id: "grch38_p14".to_string(),
475 version: "p14".to_string(),
476 source: ReportSource::Ncbi {
477 accession: "GCF_000001405.40".to_string(),
478 url: None,
479 date: None,
480 },
481 report_contigs: vec![],
482 fasta_distributions: vec![FastaDistribution {
483 id: "hg38_ucsc".to_string(),
484 display_name: "hg38 (UCSC)".to_string(),
485 source: ReferenceSource::Ucsc,
486 download_url: None,
487 tags: vec![],
488 contigs: vec![
489 FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
490 FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
491 FastaContig::new("chr3", 198_295_559, "md5_grch38_chr3"),
492 ],
493 }],
494 }],
495 },
496 HierarchicalAssembly {
497 id: "grch37".to_string(),
498 name: "GRCh37".to_string(),
499 organism: "Homo sapiens".to_string(),
500 versions: vec![AssemblyVersion {
501 id: "grch37_p13".to_string(),
502 version: "p13".to_string(),
503 source: ReportSource::Ncbi {
504 accession: "GCF_000001405.25".to_string(),
505 url: None,
506 date: None,
507 },
508 report_contigs: vec![],
509 fasta_distributions: vec![FastaDistribution {
510 id: "hg19_ucsc".to_string(),
511 display_name: "hg19 (UCSC)".to_string(),
512 source: ReferenceSource::Ucsc,
513 download_url: None,
514 tags: vec![],
515 contigs: vec![
516 FastaContig::new("chr1", 249_250_621, "md5_grch37_chr1"),
517 FastaContig::new("chr2", 243_199_373, "md5_grch37_chr2"),
518 ],
519 }],
520 }],
521 },
522 ],
523 standalone_distributions: vec![],
524 }
525 }
526
527 #[test]
528 fn test_infer_base_assembly_exact_match() {
529 let catalog = make_catalog_for_inference();
530
531 let input_contigs = vec![
533 FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
534 FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
535 FastaContig::new("chr3", 198_295_559, "md5_grch38_chr3"),
536 ];
537
538 let inferred = catalog.infer_base_assembly(&input_contigs, 0.9).unwrap();
539
540 assert_eq!(inferred.assembly_id, "grch38");
541 assert_eq!(inferred.version_id, "grch38_p14");
542 assert_eq!(inferred.matched_contigs, 3);
543 assert!((inferred.match_rate - 1.0).abs() < 0.001);
544 }
545
546 #[test]
547 fn test_infer_base_assembly_partial_match() {
548 let catalog = make_catalog_for_inference();
549
550 let input_contigs = vec![
553 FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
554 FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
555 ];
556
557 let inferred = catalog.infer_base_assembly(&input_contigs, 0.9).unwrap();
559 assert_eq!(inferred.assembly_id, "grch38");
560 assert!((inferred.match_rate - 1.0).abs() < 0.001); let input_with_extra = vec![
564 FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
565 FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
566 FastaContig::new("extra_contig", 5000, "unknown_md5"),
567 ];
568
569 let inferred = catalog.infer_base_assembly(&input_with_extra, 0.9);
571 assert!(inferred.is_none());
572
573 let inferred = catalog.infer_base_assembly(&input_with_extra, 0.5).unwrap();
575 assert_eq!(inferred.assembly_id, "grch38");
576 assert!((inferred.match_rate - 0.6667).abs() < 0.01); }
578
579 #[test]
580 fn test_infer_base_assembly_no_match() {
581 let catalog = make_catalog_for_inference();
582
583 let input_contigs = vec![
585 FastaContig::new("chr1", 100_000, "unknown_md5_1"),
586 FastaContig::new("chr2", 200_000, "unknown_md5_2"),
587 ];
588
589 let inferred = catalog.infer_base_assembly(&input_contigs, 0.5);
590 assert!(inferred.is_none());
591 }
592
593 #[test]
594 fn test_infer_base_assembly_empty_input() {
595 let catalog = make_catalog_for_inference();
596
597 let input_contigs = vec![
599 FastaContig::new("chr1", 100_000, ""),
600 FastaContig::new("chr2", 200_000, ""),
601 ];
602
603 let inferred = catalog.infer_base_assembly(&input_contigs, 0.5);
604 assert!(inferred.is_none());
605 }
606
607 #[test]
608 fn test_infer_base_assembly_distinguishes_versions() {
609 let catalog = make_catalog_for_inference();
610
611 let input_contigs = vec![
613 FastaContig::new("chr1", 249_250_621, "md5_grch37_chr1"),
614 FastaContig::new("chr2", 243_199_373, "md5_grch37_chr2"),
615 ];
616
617 let inferred = catalog.infer_base_assembly(&input_contigs, 0.9).unwrap();
618
619 assert_eq!(inferred.assembly_id, "grch37");
620 assert_eq!(inferred.version_id, "grch37_p13");
621 assert_eq!(inferred.assembly_name, "GRCh37");
622 }
623
624 #[test]
625 fn test_infer_base_assembly_default_threshold() {
626 let catalog = make_catalog_for_inference();
627
628 let input_contigs = vec![
629 FastaContig::new("chr1", 248_956_422, "md5_grch38_chr1"),
630 FastaContig::new("chr2", 242_193_529, "md5_grch38_chr2"),
631 FastaContig::new("chr3", 198_295_559, "md5_grch38_chr3"),
632 ];
633
634 let inferred = catalog.infer_base_assembly_default(&input_contigs).unwrap();
635 assert_eq!(inferred.assembly_id, "grch38");
636 }
637}