1use std::collections::{HashMap, HashSet};
8use std::path::Path;
9use thiserror::Error;
10
11use crate::core::assembly::{ContigMergeError, FastaContig, FastaDistribution};
12use crate::core::contig::{Contig, SequenceRole};
13use crate::core::reference::KnownReference;
14use crate::core::types::{Assembly, ReferenceSource};
15
16#[derive(Error, Debug)]
17pub enum BuilderError {
18 #[error("IO error: {0}")]
19 Io(#[from] std::io::Error),
20
21 #[error("Parse error: {0}")]
22 Parse(String),
23
24 #[error("Conflict: {0}")]
25 Conflict(String),
26
27 #[error("Missing required field: {0}")]
28 MissingField(String),
29
30 #[error("Merge error: {0}")]
31 Merge(#[from] ContigMergeError),
32}
33
34#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub enum InputFormat {
37 Dict,
38 Fai,
39 Fasta,
40 NcbiReport,
41 Sam,
42 Bam,
43 Cram,
44 Vcf,
45 Tsv,
46}
47
48impl InputFormat {
49 #[must_use]
51 pub fn from_path(path: &Path) -> Option<Self> {
52 let name = path.file_name()?.to_str()?;
53 let name_lower = name.to_lowercase();
54
55 #[allow(clippy::case_sensitive_file_extension_comparisons)]
58 if name_lower.contains("_assembly_report") && name_lower.ends_with(".txt") {
59 return Some(Self::NcbiReport);
60 }
61
62 if name_lower.ends_with(".fa.gz")
64 || name_lower.ends_with(".fasta.gz")
65 || name_lower.ends_with(".fna.gz")
66 || name_lower.ends_with(".fa.bgz")
67 || name_lower.ends_with(".fasta.bgz")
68 || name_lower.ends_with(".fna.bgz")
69 {
70 return Some(Self::Fasta);
71 }
72
73 let ext = path.extension()?.to_str()?.to_lowercase();
74 match ext.as_str() {
75 "dict" => Some(Self::Dict),
76 "fai" => Some(Self::Fai),
77 "fa" | "fasta" | "fna" => Some(Self::Fasta),
78 "sam" => Some(Self::Sam),
79 "bam" => Some(Self::Bam),
80 "cram" => Some(Self::Cram),
81 "vcf" => Some(Self::Vcf),
82 "tsv" | "txt" => Some(Self::Tsv),
83 "gz" => {
84 let stem = path.file_stem()?.to_str()?.to_lowercase();
86 #[allow(clippy::case_sensitive_file_extension_comparisons)]
87 if stem.ends_with(".vcf") {
88 Some(Self::Vcf)
89 } else {
90 None
91 }
92 }
93 _ => None,
94 }
95 }
96}
97
98#[derive(Debug, Clone)]
100pub struct ContigMetadata {
101 pub primary_name: String,
103
104 pub length: Option<u64>,
106
107 pub md5: Option<String>,
109
110 pub aliases: HashSet<String>,
112
113 pub assembly: Option<String>,
115
116 pub uri: Option<String>,
118
119 pub species: Option<String>,
121
122 pub sequence_role: SequenceRole,
124
125 pub sources: Vec<String>,
127}
128
129impl ContigMetadata {
130 fn new(name: String) -> Self {
131 Self {
132 primary_name: name,
133 length: None,
134 md5: None,
135 aliases: HashSet::new(),
136 assembly: None,
137 uri: None,
138 species: None,
139 sequence_role: SequenceRole::Unknown,
140 sources: Vec::new(),
141 }
142 }
143
144 fn to_contig(&self) -> Option<Contig> {
146 let length = self.length?;
147 let mut contig = Contig::new(&self.primary_name, length);
148 contig.md5.clone_from(&self.md5);
149 contig.assembly.clone_from(&self.assembly);
150 contig.uri.clone_from(&self.uri);
151 contig.species.clone_from(&self.species);
152 contig.aliases = self.aliases.iter().cloned().collect();
153 contig.sequence_role = self.sequence_role;
154 Some(contig)
155 }
156}
157
158#[derive(Debug, Clone)]
160pub struct InputRecord {
161 pub path: String,
162 pub format: InputFormat,
163 pub contigs_found: usize,
164 pub contigs_merged: usize,
165 pub aliases_added: usize,
166}
167
168pub struct ReferenceBuilder {
170 id: String,
171 display_name: String,
172 assembly: Option<Assembly>,
173 source: Option<ReferenceSource>,
174 description: Option<String>,
175 download_url: Option<String>,
176 assembly_report_url: Option<String>,
177 tags: Vec<String>,
178
179 contigs: HashMap<String, ContigMetadata>,
181
182 contig_order: Vec<String>,
184
185 alias_to_primary: HashMap<String, String>,
187
188 inputs_processed: Vec<InputRecord>,
190
191 conflicts: Vec<String>,
193
194 warnings: Vec<String>,
196
197 generate_ucsc_names: bool,
212}
213
214impl ReferenceBuilder {
215 pub fn new(id: impl Into<String>, display_name: impl Into<String>) -> Self {
217 Self {
218 id: id.into(),
219 display_name: display_name.into(),
220 assembly: None,
221 source: None,
222 description: None,
223 download_url: None,
224 assembly_report_url: None,
225 tags: Vec::new(),
226 contigs: HashMap::new(),
227 contig_order: Vec::new(),
228 alias_to_primary: HashMap::new(),
229 inputs_processed: Vec::new(),
230 conflicts: Vec::new(),
231 warnings: Vec::new(),
232 generate_ucsc_names: true, }
234 }
235
236 #[must_use]
256 pub fn generate_ucsc_names(mut self, generate: bool) -> Self {
257 self.generate_ucsc_names = generate;
258 self
259 }
260
261 #[must_use]
262 pub fn assembly(mut self, assembly: Assembly) -> Self {
263 self.assembly = Some(assembly);
264 self
265 }
266
267 #[must_use]
268 pub fn source(mut self, source: ReferenceSource) -> Self {
269 self.source = Some(source);
270 self
271 }
272
273 #[must_use]
274 pub fn description(mut self, description: impl Into<String>) -> Self {
275 self.description = Some(description.into());
276 self
277 }
278
279 #[must_use]
280 pub fn download_url(mut self, url: impl Into<String>) -> Self {
281 self.download_url = Some(url.into());
282 self
283 }
284
285 #[must_use]
286 pub fn assembly_report_url(mut self, url: impl Into<String>) -> Self {
287 self.assembly_report_url = Some(url.into());
288 self
289 }
290
291 #[must_use]
292 pub fn tags(mut self, tags: Vec<String>) -> Self {
293 self.tags = tags;
294 self
295 }
296
297 pub fn add_input(&mut self, path: &Path) -> Result<(), BuilderError> {
304 let format = InputFormat::from_path(path).ok_or_else(|| {
305 BuilderError::Parse(format!("Cannot detect format for: {}", path.display()))
306 })?;
307 self.add_input_with_format(path, format)
308 }
309
310 pub fn add_input_with_format(
317 &mut self,
318 path: &Path,
319 format: InputFormat,
320 ) -> Result<(), BuilderError> {
321 let path_str = path.display().to_string();
322
323 match format {
324 InputFormat::Dict | InputFormat::Sam => {
325 self.add_dict_or_sam(path, &path_str, format)?;
326 }
327 InputFormat::Bam => {
328 self.add_bam(path, &path_str)?;
329 }
330 InputFormat::Cram => {
331 self.add_cram(path, &path_str)?;
332 }
333 InputFormat::Fai => {
334 self.add_fai(path, &path_str)?;
335 }
336 InputFormat::Fasta => {
337 self.add_fasta(path, &path_str)?;
338 }
339 InputFormat::NcbiReport => {
340 self.add_ncbi_report(path, &path_str)?;
341 }
342 InputFormat::Vcf => {
343 self.add_vcf(path, &path_str)?;
344 }
345 InputFormat::Tsv => {
346 self.add_tsv(path, &path_str)?;
347 }
348 }
349
350 Ok(())
351 }
352
353 fn add_dict_or_sam(
354 &mut self,
355 path: &Path,
356 path_str: &str,
357 format: InputFormat,
358 ) -> Result<(), BuilderError> {
359 let content = std::fs::read_to_string(path)?;
360 let query = crate::parsing::sam::parse_header_text(&content)
361 .map_err(|e| BuilderError::Parse(e.to_string()))?;
362
363 let mut record = InputRecord {
364 path: path_str.to_string(),
365 format,
366 contigs_found: query.contigs.len(),
367 contigs_merged: 0,
368 aliases_added: 0,
369 };
370
371 for contig in query.contigs {
372 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
373 if merged {
374 record.contigs_merged += 1;
375 }
376 record.aliases_added += aliases;
377 }
378
379 self.inputs_processed.push(record);
380 Ok(())
381 }
382
383 fn add_bam(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
384 let query = crate::parsing::sam::parse_file(path)
385 .map_err(|e| BuilderError::Parse(e.to_string()))?;
386
387 let mut record = InputRecord {
388 path: path_str.to_string(),
389 format: InputFormat::Bam,
390 contigs_found: query.contigs.len(),
391 contigs_merged: 0,
392 aliases_added: 0,
393 };
394
395 for contig in query.contigs {
396 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
397 if merged {
398 record.contigs_merged += 1;
399 }
400 record.aliases_added += aliases;
401 }
402
403 self.inputs_processed.push(record);
404 Ok(())
405 }
406
407 fn add_cram(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
408 let query = crate::parsing::sam::parse_file(path)
409 .map_err(|e| BuilderError::Parse(e.to_string()))?;
410
411 let mut record = InputRecord {
412 path: path_str.to_string(),
413 format: InputFormat::Cram,
414 contigs_found: query.contigs.len(),
415 contigs_merged: 0,
416 aliases_added: 0,
417 };
418
419 for contig in query.contigs {
420 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
421 if merged {
422 record.contigs_merged += 1;
423 }
424 record.aliases_added += aliases;
425 }
426
427 self.inputs_processed.push(record);
428 Ok(())
429 }
430
431 fn add_fai(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
432 let content = std::fs::read_to_string(path)?;
433 let query = crate::parsing::fai::parse_fai_text(&content)
434 .map_err(|e| BuilderError::Parse(e.to_string()))?;
435
436 let mut record = InputRecord {
437 path: path_str.to_string(),
438 format: InputFormat::Fai,
439 contigs_found: query.contigs.len(),
440 contigs_merged: 0,
441 aliases_added: 0,
442 };
443
444 for contig in query.contigs {
445 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
446 if merged {
447 record.contigs_merged += 1;
448 }
449 record.aliases_added += aliases;
450 }
451
452 self.inputs_processed.push(record);
453 Ok(())
454 }
455
456 fn add_ncbi_report(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
457 let content = std::fs::read_to_string(path)?;
458 let entries = crate::parsing::ncbi_report::parse_ncbi_report_text(&content)
459 .map_err(|e| BuilderError::Parse(e.to_string()))?;
460
461 let mut record = InputRecord {
462 path: path_str.to_string(),
463 format: InputFormat::NcbiReport,
464 contigs_found: entries.len(),
465 contigs_merged: 0,
466 aliases_added: 0,
467 };
468
469 for entry in entries {
470 let contig = entry.to_contig_with_options(self.generate_ucsc_names);
472 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
473 if merged {
474 record.contigs_merged += 1;
475 }
476 record.aliases_added += aliases;
477 }
478
479 self.inputs_processed.push(record);
480 Ok(())
481 }
482
483 fn add_vcf(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
484 let content = std::fs::read_to_string(path)?;
485 let query = crate::parsing::vcf::parse_vcf_header_text(&content)
486 .map_err(|e| BuilderError::Parse(e.to_string()))?;
487
488 let mut record = InputRecord {
489 path: path_str.to_string(),
490 format: InputFormat::Vcf,
491 contigs_found: query.contigs.len(),
492 contigs_merged: 0,
493 aliases_added: 0,
494 };
495
496 for contig in query.contigs {
497 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
498 if merged {
499 record.contigs_merged += 1;
500 }
501 record.aliases_added += aliases;
502 }
503
504 self.inputs_processed.push(record);
505 Ok(())
506 }
507
508 fn add_tsv(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
509 let content = std::fs::read_to_string(path)?;
510 let query = crate::parsing::tsv::parse_tsv_text(&content, '\t')
511 .map_err(|e| BuilderError::Parse(e.to_string()))?;
512
513 let mut record = InputRecord {
514 path: path_str.to_string(),
515 format: InputFormat::Tsv,
516 contigs_found: query.contigs.len(),
517 contigs_merged: 0,
518 aliases_added: 0,
519 };
520
521 for contig in query.contigs {
522 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
523 if merged {
524 record.contigs_merged += 1;
525 }
526 record.aliases_added += aliases;
527 }
528
529 self.inputs_processed.push(record);
530 Ok(())
531 }
532
533 fn add_fasta(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
534 let query = crate::parsing::fasta::parse_fasta_file_with_md5(path)
535 .map_err(|e| BuilderError::Parse(e.to_string()))?;
536
537 let mut record = InputRecord {
538 path: path_str.to_string(),
539 format: InputFormat::Fasta,
540 contigs_found: query.contigs.len(),
541 contigs_merged: 0,
542 aliases_added: 0,
543 };
544
545 for contig in query.contigs {
546 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
547 if merged {
548 record.contigs_merged += 1;
549 }
550 record.aliases_added += aliases;
551 }
552
553 self.inputs_processed.push(record);
554 Ok(())
555 }
556
557 fn merge_contig(
560 &mut self,
561 contig: &Contig,
562 source: &str,
563 ) -> Result<(bool, usize), BuilderError> {
564 let mut existing_primary = self.find_existing_primary(&contig.name);
566
567 if existing_primary.is_none() {
569 for alias in &contig.aliases {
570 if let Some(primary) = self.find_existing_primary(alias) {
571 existing_primary = Some(primary);
572 break;
573 }
574 }
575 }
576
577 if let Some(primary) = existing_primary {
578 let metadata = self.contigs.get_mut(&primary).unwrap();
580 let aliases_before = metadata.aliases.len();
581
582 if let Some(existing_len) = metadata.length {
584 if existing_len != contig.length {
585 let msg = format!(
586 "Length conflict for '{}': {} vs {} (from {})",
587 contig.name, existing_len, contig.length, source
588 );
589 self.conflicts.push(msg.clone());
590 return Err(BuilderError::Conflict(msg));
591 }
592 } else {
593 metadata.length = Some(contig.length);
594 }
595
596 if let (Some(existing_md5), Some(new_md5)) = (&metadata.md5, &contig.md5) {
598 if existing_md5 != new_md5 {
599 let msg = format!(
600 "MD5 conflict for '{}': {} vs {} (from {})",
601 contig.name, existing_md5, new_md5, source
602 );
603 self.conflicts.push(msg.clone());
604 return Err(BuilderError::Conflict(msg));
605 }
606 } else if metadata.md5.is_none() && contig.md5.is_some() {
607 metadata.md5.clone_from(&contig.md5);
608 }
609
610 for alias in &contig.aliases {
612 if !metadata.aliases.contains(alias) && alias != &metadata.primary_name {
613 if let Some(other_primary) = self.alias_to_primary.get(alias) {
615 if other_primary != &primary {
616 self.warnings.push(format!(
617 "Alias '{alias}' already mapped to '{other_primary}', skipping for '{primary}'"
618 ));
619 continue;
620 }
621 }
622 metadata.aliases.insert(alias.clone());
623 self.alias_to_primary.insert(alias.clone(), primary.clone());
624 }
625 }
626
627 if contig.name != primary && !metadata.aliases.contains(&contig.name) {
629 metadata.aliases.insert(contig.name.clone());
630 self.alias_to_primary
631 .insert(contig.name.clone(), primary.clone());
632 }
633
634 if metadata.assembly.is_none() && contig.assembly.is_some() {
636 metadata.assembly.clone_from(&contig.assembly);
637 }
638 if metadata.uri.is_none() && contig.uri.is_some() {
639 metadata.uri.clone_from(&contig.uri);
640 }
641 if metadata.species.is_none() && contig.species.is_some() {
642 metadata.species.clone_from(&contig.species);
643 }
644 if matches!(metadata.sequence_role, SequenceRole::Unknown) {
646 metadata.sequence_role = contig.sequence_role;
647 }
648
649 metadata.sources.push(source.to_string());
650
651 let aliases_added = metadata.aliases.len() - aliases_before;
652 Ok((true, aliases_added))
653 } else {
654 let mut metadata = ContigMetadata::new(contig.name.clone());
656 metadata.length = Some(contig.length);
657 metadata.md5.clone_from(&contig.md5);
658 metadata.assembly.clone_from(&contig.assembly);
659 metadata.uri.clone_from(&contig.uri);
660 metadata.species.clone_from(&contig.species);
661 metadata.sequence_role = contig.sequence_role;
662 metadata.sources.push(source.to_string());
663
664 let mut aliases_added = 0;
666 for alias in &contig.aliases {
667 if alias != &contig.name {
668 if let Some(other_primary) = self.alias_to_primary.get(alias) {
669 self.warnings.push(format!(
670 "Alias '{}' already mapped to '{}', skipping for '{}'",
671 alias, other_primary, contig.name
672 ));
673 continue;
674 }
675 metadata.aliases.insert(alias.clone());
676 self.alias_to_primary
677 .insert(alias.clone(), contig.name.clone());
678 aliases_added += 1;
679 }
680 }
681
682 self.contig_order.push(contig.name.clone());
683 self.contigs.insert(contig.name.clone(), metadata);
684
685 Ok((false, aliases_added))
686 }
687 }
688
689 fn find_existing_primary(&self, name: &str) -> Option<String> {
691 if self.contigs.contains_key(name) {
693 return Some(name.to_string());
694 }
695
696 if let Some(primary) = self.alias_to_primary.get(name) {
698 return Some(primary.clone());
699 }
700
701 None
702 }
703
704 pub fn build(self) -> Result<KnownReference, BuilderError> {
711 if self.contigs.is_empty() {
713 return Err(BuilderError::MissingField("No contigs added".to_string()));
714 }
715
716 let mut missing_length = Vec::new();
718 for name in &self.contig_order {
719 if let Some(metadata) = self.contigs.get(name) {
720 if metadata.length.is_none() {
721 missing_length.push(name.clone());
722 }
723 }
724 }
725 if !missing_length.is_empty() {
726 return Err(BuilderError::MissingField(format!(
727 "Missing length for contigs: {missing_length:?}"
728 )));
729 }
730
731 let contigs: Vec<Contig> = self
733 .contig_order
734 .iter()
735 .filter_map(|name| self.contigs.get(name))
736 .filter_map(ContigMetadata::to_contig)
737 .collect();
738
739 let mut contigs_missing_from_fasta: Vec<String> = Vec::new();
742 for (name, meta) in &self.contigs {
743 if meta.md5.is_none() && matches!(meta.sequence_role, SequenceRole::AssembledMolecule) {
744 contigs_missing_from_fasta.push(name.clone());
745 }
746 }
747 contigs_missing_from_fasta.sort();
748
749 let assembly = self
751 .assembly
752 .unwrap_or_else(|| detect_assembly_from_name(&self.display_name));
753
754 let source = self
756 .source
757 .unwrap_or(ReferenceSource::Custom("Unknown".to_string()));
758
759 let naming_convention = crate::core::contig::detect_naming_convention(&contigs);
761
762 let mut reference = KnownReference {
763 id: crate::core::types::ReferenceId::new(&self.id),
764 display_name: self.display_name,
765 assembly,
766 source,
767 naming_convention,
768 download_url: self.download_url,
769 assembly_report_url: self.assembly_report_url,
770 contigs,
771 description: self.description,
772 tags: self.tags,
773 contigs_missing_from_fasta,
774 md5_set: HashSet::new(),
775 name_length_set: HashSet::new(),
776 signature: None,
777 };
778
779 reference.rebuild_indexes();
780 Ok(reference)
781 }
782
783 #[must_use]
785 pub fn summary(&self) -> BuildSummary {
786 let total_contigs = self.contigs.len();
787 let with_length = self.contigs.values().filter(|m| m.length.is_some()).count();
788 let with_md5 = self.contigs.values().filter(|m| m.md5.is_some()).count();
789 let with_aliases = self
790 .contigs
791 .values()
792 .filter(|m| !m.aliases.is_empty())
793 .count();
794
795 let mut missing_md5_assembled: Vec<String> = Vec::new();
797 for (name, meta) in &self.contigs {
798 if meta.md5.is_none() {
799 if matches!(meta.sequence_role, SequenceRole::AssembledMolecule) {
801 missing_md5_assembled.push(name.clone());
802 }
803 }
804 }
805
806 let primary_count = self
808 .contigs
809 .keys()
810 .filter(|name| {
811 let contig = Contig::new(name.as_str(), 0);
812 contig.is_primary_chromosome()
813 })
814 .count();
815
816 let alt_count = self
818 .contigs
819 .keys()
820 .filter(|name| name.ends_with("_alt") || name.contains("_alt_"))
821 .count();
822
823 BuildSummary {
824 id: self.id.clone(),
825 display_name: self.display_name.clone(),
826 assembly: self.assembly.clone(),
827 source: self.source.clone(),
828 inputs: self.inputs_processed.clone(),
829 total_contigs,
830 primary_chromosomes: primary_count,
831 alt_contigs: alt_count,
832 other_contigs: total_contigs.saturating_sub(primary_count + alt_count),
833 with_length,
834 with_md5,
835 with_aliases,
836 missing_md5_assembled,
837 conflicts: self.conflicts.clone(),
838 warnings: self.warnings.clone(),
839 }
840 }
841}
842
843#[derive(Debug, Clone)]
845pub struct BuildSummary {
846 pub id: String,
847 pub display_name: String,
848 pub assembly: Option<Assembly>,
849 pub source: Option<ReferenceSource>,
850 pub inputs: Vec<InputRecord>,
851 pub total_contigs: usize,
852 pub primary_chromosomes: usize,
853 pub alt_contigs: usize,
854 pub other_contigs: usize,
855 pub with_length: usize,
856 pub with_md5: usize,
857 pub with_aliases: usize,
858 pub missing_md5_assembled: Vec<String>,
859 pub conflicts: Vec<String>,
860 pub warnings: Vec<String>,
861}
862
863impl std::fmt::Display for BuildSummary {
864 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
865 writeln!(f, "Reference Builder Summary")?;
866 writeln!(f, "=========================")?;
867 writeln!(f, "ID: {}", self.id)?;
868 writeln!(f, "Name: {}", self.display_name)?;
869 if let Some(ref assembly) = self.assembly {
870 writeln!(f, "Assembly: {assembly:?}")?;
871 }
872 if let Some(ref source) = self.source {
873 writeln!(f, "Source: {source:?}")?;
874 }
875 writeln!(f)?;
876
877 writeln!(f, "Inputs:")?;
878 for (i, input) in self.inputs.iter().enumerate() {
879 writeln!(
880 f,
881 " [{}] {} ({:?}) -> {} contigs, {} merged, {} aliases",
882 i + 1,
883 input.path,
884 input.format,
885 input.contigs_found,
886 input.contigs_merged,
887 input.aliases_added
888 )?;
889 }
890 writeln!(f)?;
891
892 writeln!(f, "Contigs: {} total", self.total_contigs)?;
893 writeln!(f, " - Primary chromosomes: {}", self.primary_chromosomes)?;
894 writeln!(f, " - ALT contigs: {}", self.alt_contigs)?;
895 writeln!(f, " - Other: {}", self.other_contigs)?;
896 writeln!(f)?;
897
898 writeln!(f, "Coverage:")?;
899 let pct = |n: usize, total: usize| {
900 if total == 0 {
901 0
902 } else {
903 (n * 100) / total
904 }
905 };
906 let check = |n: usize, total: usize| {
907 if n == total {
908 "+"
909 } else {
910 "o"
911 }
912 };
913 writeln!(
914 f,
915 " {} Length: {}/{} ({}%)",
916 check(self.with_length, self.total_contigs),
917 self.with_length,
918 self.total_contigs,
919 pct(self.with_length, self.total_contigs)
920 )?;
921 writeln!(
922 f,
923 " {} MD5: {}/{} ({}%)",
924 check(self.with_md5, self.total_contigs),
925 self.with_md5,
926 self.total_contigs,
927 pct(self.with_md5, self.total_contigs)
928 )?;
929 writeln!(
930 f,
931 " {} Aliases: {}/{} ({}%)",
932 check(self.with_aliases, self.total_contigs),
933 self.with_aliases,
934 self.total_contigs,
935 pct(self.with_aliases, self.total_contigs)
936 )?;
937 writeln!(f)?;
938
939 writeln!(f, "Conflicts: {}", self.conflicts.len())?;
940 for conflict in &self.conflicts {
941 writeln!(f, " - {conflict}")?;
942 }
943
944 let total_warnings = self.warnings.len() + self.missing_md5_assembled.len();
945 writeln!(f, "Warnings: {total_warnings}")?;
946 for warning in &self.warnings {
947 writeln!(f, " - {warning}")?;
948 }
949 if !self.missing_md5_assembled.is_empty() {
950 writeln!(
951 f,
952 " - Missing MD5 for assembled-molecule contigs: {}",
953 self.missing_md5_assembled.join(", ")
954 )?;
955 }
956
957 Ok(())
958 }
959}
960
961#[derive(Debug)]
969pub struct DistributionBuilder {
970 id: String,
971 display_name: String,
972 source: ReferenceSource,
973 download_url: Option<String>,
974 tags: Vec<String>,
975
976 contigs: HashMap<(String, u64), FastaContig>,
978
979 insertion_order: Vec<(String, u64)>,
981
982 source_files: Vec<String>,
984
985 generate_ucsc_names: bool,
987}
988
989impl Default for DistributionBuilder {
990 fn default() -> Self {
991 Self::new("")
992 }
993}
994
995impl DistributionBuilder {
996 pub fn new(id: impl Into<String>) -> Self {
998 Self {
999 id: id.into(),
1000 display_name: String::new(),
1001 source: ReferenceSource::Custom("custom".to_string()),
1002 download_url: None,
1003 tags: Vec::new(),
1004 contigs: HashMap::new(),
1005 insertion_order: Vec::new(),
1006 source_files: Vec::new(),
1007 generate_ucsc_names: true, }
1009 }
1010
1011 #[must_use]
1015 pub fn with_generate_ucsc_names(mut self, generate: bool) -> Self {
1016 self.generate_ucsc_names = generate;
1017 self
1018 }
1019
1020 #[must_use]
1022 pub fn with_display_name(mut self, name: impl Into<String>) -> Self {
1023 self.display_name = name.into();
1024 self
1025 }
1026
1027 #[must_use]
1029 pub fn with_source(mut self, source: ReferenceSource) -> Self {
1030 self.source = source;
1031 self
1032 }
1033
1034 #[must_use]
1036 pub fn with_download_url(mut self, url: impl Into<String>) -> Self {
1037 self.download_url = Some(url.into());
1038 self
1039 }
1040
1041 #[must_use]
1043 pub fn with_tags(mut self, tags: Vec<String>) -> Self {
1044 self.tags = tags;
1045 self
1046 }
1047
1048 pub fn add_input(&mut self, path: &Path) -> Result<&mut Self, BuilderError> {
1055 let format = InputFormat::from_path(path).ok_or_else(|| {
1056 BuilderError::Parse(format!("Cannot detect format for: {}", path.display()))
1057 })?;
1058 self.add_input_with_format(path, format)
1059 }
1060
1061 pub fn add_input_with_format(
1068 &mut self,
1069 path: &Path,
1070 format: InputFormat,
1071 ) -> Result<&mut Self, BuilderError> {
1072 let path_str = path.display().to_string();
1073 self.source_files.push(path_str.clone());
1074
1075 let contigs = self.parse_input(path, format)?;
1076
1077 for contig in contigs {
1078 let key = (contig.name.clone(), contig.length);
1079 #[allow(clippy::cast_possible_truncation)] let fasta_contig = FastaContig {
1081 name: contig.name,
1082 length: contig.length,
1083 md5: contig.md5.unwrap_or_default(),
1084 sort_order: self.insertion_order.len() as u32,
1085 report_contig_id: None,
1086 aliases: contig.aliases,
1087 };
1088
1089 if let Some(existing) = self.contigs.get_mut(&key) {
1090 existing.merge(&fasta_contig)?;
1091 } else {
1092 self.insertion_order.push(key.clone());
1093 self.contigs.insert(key, fasta_contig);
1094 }
1095 }
1096
1097 Ok(self)
1098 }
1099
1100 fn parse_input(&self, path: &Path, format: InputFormat) -> Result<Vec<Contig>, BuilderError> {
1102 match format {
1103 InputFormat::Dict | InputFormat::Sam => {
1104 let content = std::fs::read_to_string(path)?;
1105 let query = crate::parsing::sam::parse_header_text(&content)
1106 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1107 Ok(query.contigs)
1108 }
1109 InputFormat::Bam | InputFormat::Cram => {
1110 let query = crate::parsing::sam::parse_file(path)
1111 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1112 Ok(query.contigs)
1113 }
1114 InputFormat::Fai => {
1115 let content = std::fs::read_to_string(path)?;
1116 let query = crate::parsing::fai::parse_fai_text(&content)
1117 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1118 Ok(query.contigs)
1119 }
1120 InputFormat::Fasta => {
1121 let query = crate::parsing::fasta::parse_fasta_file_with_md5(path)
1123 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1124 Ok(query.contigs)
1125 }
1126 InputFormat::NcbiReport => {
1127 let content = std::fs::read_to_string(path)?;
1128 let entries = crate::parsing::ncbi_report::parse_ncbi_report_text(&content)
1129 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1130 Ok(entries
1132 .into_iter()
1133 .map(|e| e.to_contig_with_options(self.generate_ucsc_names))
1134 .collect())
1135 }
1136 InputFormat::Vcf => {
1137 let content = std::fs::read_to_string(path)?;
1138 let query = crate::parsing::vcf::parse_vcf_header_text(&content)
1139 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1140 Ok(query.contigs)
1141 }
1142 InputFormat::Tsv => {
1143 let content = std::fs::read_to_string(path)?;
1144 let query = crate::parsing::tsv::parse_tsv_text(&content, '\t')
1145 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1146 Ok(query.contigs)
1147 }
1148 }
1149 }
1150
1151 pub fn build(self) -> Result<FastaDistribution, BuilderError> {
1157 if self.contigs.is_empty() {
1158 return Err(BuilderError::MissingField("No contigs found".to_string()));
1159 }
1160
1161 let mut contigs: Vec<FastaContig> = Vec::with_capacity(self.insertion_order.len());
1163 for (i, key) in self.insertion_order.iter().enumerate() {
1164 if let Some(mut contig) = self.contigs.get(key).cloned() {
1165 #[allow(clippy::cast_possible_truncation)] {
1167 contig.sort_order = i as u32;
1168 }
1169 contigs.push(contig);
1170 }
1171 }
1172
1173 Ok(FastaDistribution {
1174 id: self.id,
1175 display_name: self.display_name,
1176 source: self.source,
1177 download_url: self.download_url,
1178 tags: self.tags,
1179 contigs,
1180 })
1181 }
1182}
1183
1184#[must_use]
1186pub fn detect_assembly_from_name(display_name: &str) -> Assembly {
1187 let lower = display_name.to_lowercase();
1188 if lower.contains("chm13") || lower.contains("t2t") {
1189 Assembly::Other("CHM13".to_string())
1190 } else if lower.contains("38") || lower.contains("hg38") || lower.contains("hs38") {
1191 Assembly::Grch38
1192 } else if lower.contains("37")
1193 || lower.contains("19")
1194 || lower.contains("hg19")
1195 || lower.contains("hs37")
1196 || lower.contains("b37")
1197 {
1198 Assembly::Grch37
1199 } else {
1200 Assembly::Other(display_name.to_string())
1201 }
1202}
1203
1204#[cfg(test)]
1205mod tests {
1206 use super::*;
1207
1208 #[test]
1209 fn test_input_format_detection() {
1210 assert_eq!(
1211 InputFormat::from_path(Path::new("test.dict")),
1212 Some(InputFormat::Dict)
1213 );
1214 assert_eq!(
1215 InputFormat::from_path(Path::new("test.fai")),
1216 Some(InputFormat::Fai)
1217 );
1218 assert_eq!(
1219 InputFormat::from_path(Path::new("test.fa")),
1220 Some(InputFormat::Fasta)
1221 );
1222 assert_eq!(
1223 InputFormat::from_path(Path::new("test.fasta")),
1224 Some(InputFormat::Fasta)
1225 );
1226 assert_eq!(
1227 InputFormat::from_path(Path::new("test.fna")),
1228 Some(InputFormat::Fasta)
1229 );
1230 assert_eq!(
1231 InputFormat::from_path(Path::new("test.fa.gz")),
1232 Some(InputFormat::Fasta)
1233 );
1234 assert_eq!(
1235 InputFormat::from_path(Path::new("test.vcf")),
1236 Some(InputFormat::Vcf)
1237 );
1238 assert_eq!(
1239 InputFormat::from_path(Path::new("test.vcf.gz")),
1240 Some(InputFormat::Vcf)
1241 );
1242 assert_eq!(
1243 InputFormat::from_path(Path::new("GRCh38_assembly_report.txt")),
1244 Some(InputFormat::NcbiReport)
1245 );
1246 }
1247
1248 #[test]
1249 fn test_detect_assembly() {
1250 assert!(matches!(
1251 detect_assembly_from_name("GRCh38 (Broad)"),
1252 Assembly::Grch38
1253 ));
1254 assert!(matches!(
1255 detect_assembly_from_name("hg38 UCSC"),
1256 Assembly::Grch38
1257 ));
1258 assert!(matches!(
1259 detect_assembly_from_name("GRCh37"),
1260 Assembly::Grch37
1261 ));
1262 assert!(matches!(
1263 detect_assembly_from_name("hg19"),
1264 Assembly::Grch37
1265 ));
1266 assert!(matches!(
1267 detect_assembly_from_name("T2T-CHM13"),
1268 Assembly::Other(_)
1269 ));
1270 }
1271
1272 #[test]
1273 fn test_builder_single_input() {
1274 let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1275 .assembly(Assembly::Grch38)
1276 .source(ReferenceSource::Custom("test".to_string()));
1277
1278 let contig = Contig::new("chr1", 248_956_422);
1280 builder.merge_contig(&contig, "test").unwrap();
1281
1282 let reference = builder.build().unwrap();
1283 assert_eq!(reference.id.0, "test_ref");
1284 assert_eq!(reference.contigs.len(), 1);
1285 assert_eq!(reference.contigs[0].name, "chr1");
1286 }
1287
1288 #[test]
1289 fn test_builder_merge_with_aliases() {
1290 let mut builder = ReferenceBuilder::new("test_ref", "Test Reference");
1291
1292 let contig1 = Contig::new("chr1", 248_956_422);
1294 builder.merge_contig(&contig1, "source1").unwrap();
1295
1296 let mut contig2 = Contig::new("1", 248_956_422);
1298 contig2.aliases = vec!["chr1".to_string()];
1299 builder.merge_contig(&contig2, "source2").unwrap();
1300
1301 let summary = builder.summary();
1303 assert_eq!(summary.total_contigs, 1);
1304
1305 let reference = builder.build().unwrap();
1306 assert_eq!(reference.contigs.len(), 1);
1307 assert!(reference.contigs[0].aliases.contains(&"1".to_string()));
1309 }
1310
1311 #[test]
1312 fn test_builder_conflict_detection() {
1313 let mut builder = ReferenceBuilder::new("test_ref", "Test Reference");
1314
1315 let contig1 = Contig::new("chr1", 248_956_422);
1317 builder.merge_contig(&contig1, "source1").unwrap();
1318
1319 let contig2 = Contig::new("chr1", 100_000);
1321 let result = builder.merge_contig(&contig2, "source2");
1322 assert!(result.is_err());
1323 assert!(matches!(result.unwrap_err(), BuilderError::Conflict(_)));
1324 }
1325
1326 #[test]
1327 fn test_distribution_builder_single_dict() {
1328 use std::io::Write;
1329 use tempfile::NamedTempFile;
1330
1331 let mut file = NamedTempFile::with_suffix(".dict").unwrap();
1332 writeln!(file, "@HD\tVN:1.6").unwrap();
1333 writeln!(
1334 file,
1335 "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1336 )
1337 .unwrap();
1338 writeln!(
1339 file,
1340 "@SQ\tSN:chr2\tLN:2000\tM5:f98db672eb0993dcfdabafe2a882905c"
1341 )
1342 .unwrap();
1343
1344 let mut builder = DistributionBuilder::new("test_ref");
1345 builder.add_input(file.path()).unwrap();
1346 let dist = builder.build().unwrap();
1347
1348 assert_eq!(dist.contigs.len(), 2);
1349 assert_eq!(dist.contigs[0].name, "chr1");
1350 assert_eq!(dist.contigs[0].md5, "6aef897c3d6ff0c78aff06ac189178dd");
1351 assert_eq!(dist.contigs[1].name, "chr2");
1352 assert_eq!(dist.contigs[1].md5, "f98db672eb0993dcfdabafe2a882905c");
1353 }
1354
1355 #[test]
1356 fn test_distribution_builder_merges_inputs() {
1357 use std::io::Write;
1358 use tempfile::NamedTempFile;
1359
1360 let mut dict = NamedTempFile::with_suffix(".dict").unwrap();
1362 writeln!(dict, "@HD\tVN:1.6").unwrap();
1363 writeln!(dict, "@SQ\tSN:chr1\tLN:1000").unwrap();
1364
1365 let mut vcf = NamedTempFile::with_suffix(".vcf").unwrap();
1367 writeln!(vcf, "##fileformat=VCFv4.2").unwrap();
1368 writeln!(
1369 vcf,
1370 "##contig=<ID=chr1,length=1000,md5=6aef897c3d6ff0c78aff06ac189178dd>"
1371 )
1372 .unwrap();
1373 writeln!(vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO").unwrap();
1374
1375 let mut builder = DistributionBuilder::new("test_ref");
1376 builder.add_input(dict.path()).unwrap();
1377 builder.add_input(vcf.path()).unwrap();
1378 let distribution = builder.build().unwrap();
1379
1380 assert_eq!(distribution.contigs.len(), 1);
1381 assert_eq!(
1382 distribution.contigs[0].md5,
1383 "6aef897c3d6ff0c78aff06ac189178dd"
1384 );
1385 }
1386
1387 #[test]
1388 fn test_distribution_builder_md5_conflict() {
1389 use std::io::Write;
1390 use tempfile::NamedTempFile;
1391
1392 let mut dict = NamedTempFile::with_suffix(".dict").unwrap();
1393 writeln!(dict, "@HD\tVN:1.6").unwrap();
1394 writeln!(
1395 dict,
1396 "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1397 )
1398 .unwrap();
1399
1400 let mut vcf = NamedTempFile::with_suffix(".vcf").unwrap();
1401 writeln!(vcf, "##fileformat=VCFv4.2").unwrap();
1402 writeln!(
1403 vcf,
1404 "##contig=<ID=chr1,length=1000,md5=f98db672eb0993dcfdabafe2a882905c>"
1405 )
1406 .unwrap();
1407 writeln!(vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO").unwrap();
1408
1409 let mut builder = DistributionBuilder::new("test_ref");
1410 builder.add_input(dict.path()).unwrap();
1411 let result = builder.add_input(vcf.path());
1412
1413 assert!(matches!(result, Err(BuilderError::Merge(_))));
1414 }
1415
1416 #[test]
1417 fn test_distribution_builder_preserves_order() {
1418 use std::io::Write;
1419 use tempfile::NamedTempFile;
1420
1421 let mut file = NamedTempFile::with_suffix(".dict").unwrap();
1422 writeln!(file, "@HD\tVN:1.6").unwrap();
1423 writeln!(
1424 file,
1425 "@SQ\tSN:chrM\tLN:16569\tM5:d2ed829b8a1628d16cbeee88e88e39eb"
1426 )
1427 .unwrap();
1428 writeln!(
1429 file,
1430 "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1431 )
1432 .unwrap();
1433 writeln!(
1434 file,
1435 "@SQ\tSN:chr2\tLN:242193529\tM5:f98db672eb0993dcfdabafe2a882905c"
1436 )
1437 .unwrap();
1438
1439 let mut builder = DistributionBuilder::new("test_ref");
1440 builder.add_input(file.path()).unwrap();
1441 let dist = builder.build().unwrap();
1442
1443 assert_eq!(dist.contigs[0].name, "chrM");
1444 assert_eq!(dist.contigs[0].sort_order, 0);
1445 assert_eq!(dist.contigs[1].name, "chr1");
1446 assert_eq!(dist.contigs[1].sort_order, 1);
1447 assert_eq!(dist.contigs[2].name, "chr2");
1448 assert_eq!(dist.contigs[2].sort_order, 2);
1449 }
1450}