1use std::collections::{HashMap, HashSet};
8use std::path::Path;
9use thiserror::Error;
10
11use crate::core::assembly::{ContigMergeError, FastaContig, FastaDistribution};
12use crate::core::contig::{Contig, SequenceRole};
13use crate::core::reference::KnownReference;
14use crate::core::types::{Assembly, ReferenceSource};
15use crate::utils::validation::{is_valid_md5, is_valid_sha512t24u};
16
/// Errors produced while collecting inputs for, or building, a reference.
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum BuilderError {
    /// An underlying I/O operation failed (converted from `std::io::Error`).
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// An input could not be parsed, or its format could not be detected
    /// from the file name.
    #[error("Parse error: {0}")]
    Parse(String),

    /// Two inputs disagree about a contig (length or digest mismatch).
    #[error("Conflict: {0}")]
    Conflict(String),

    /// A value failed validation, e.g. a digest rejected by its validator.
    #[error("Validation error: {0}")]
    Validation(String),

    /// Data required at build time is absent (no contigs, or no length).
    #[error("Missing required field: {0}")]
    MissingField(String),

    /// Merging two `FastaContig`s failed (converted from `ContigMergeError`).
    #[error("Merge error: {0}")]
    Merge(#[from] ContigMergeError),
}
38
/// File formats the builders can ingest.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InputFormat {
    /// Sequence dictionary (`.dict`).
    Dict,
    /// FASTA index (`.fai`).
    Fai,
    /// FASTA sequence file, optionally `.gz`/`.bgz` compressed.
    Fasta,
    /// NCBI assembly report (`*_assembly_report*.txt`).
    NcbiReport,
    /// SAM file (`.sam`).
    Sam,
    /// BAM file (`.bam`).
    Bam,
    /// CRAM file (`.cram`).
    Cram,
    /// VCF file (`.vcf` or `.vcf.gz`).
    Vcf,
    /// Tab-separated table (`.tsv` or `.txt`).
    Tsv,
}

impl InputFormat {
    /// Guess the format of `path` from its file name alone.
    ///
    /// Matching is case-insensitive. The checks run in this order:
    /// NCBI assembly report, compressed FASTA, then the plain extension.
    /// Returns `None` when the path has no UTF-8 file name, no extension,
    /// or an extension that is not recognised.
    #[must_use]
    // Every suffix comparison below runs against an already-lowercased string.
    #[allow(clippy::case_sensitive_file_extension_comparisons)]
    pub fn from_path(path: &Path) -> Option<Self> {
        const COMPRESSED_FASTA_SUFFIXES: [&str; 6] = [
            ".fa.gz",
            ".fasta.gz",
            ".fna.gz",
            ".fa.bgz",
            ".fasta.bgz",
            ".fna.bgz",
        ];

        let file_name = path.file_name()?.to_str()?.to_lowercase();

        // Assembly reports end in `.txt`, so they must be recognised before
        // the generic `.txt` -> TSV mapping further down.
        if file_name.ends_with(".txt") && file_name.contains("_assembly_report") {
            return Some(Self::NcbiReport);
        }

        if COMPRESSED_FASTA_SUFFIXES
            .iter()
            .any(|&suffix| file_name.ends_with(suffix))
        {
            return Some(Self::Fasta);
        }

        let extension = path.extension()?.to_str()?.to_lowercase();
        let detected = match extension.as_str() {
            "dict" => Self::Dict,
            "fai" => Self::Fai,
            "fa" | "fasta" | "fna" => Self::Fasta,
            "sam" => Self::Sam,
            "bam" => Self::Bam,
            "cram" => Self::Cram,
            "vcf" => Self::Vcf,
            "tsv" | "txt" => Self::Tsv,
            "gz" => {
                // Only `<something>.vcf.gz` is accepted among the remaining
                // gzip names; compressed FASTA was handled above.
                let stem = path.file_stem()?.to_str()?.to_lowercase();
                if !stem.ends_with(".vcf") {
                    return None;
                }
                Self::Vcf
            }
            _ => return None,
        };
        Some(detected)
    }
}
102
/// Everything the builder has learned about one contig across all inputs.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ContigMetadata {
    /// Name under which the contig was first registered.
    pub primary_name: String,

    /// Sequence length; `None` until a source provides it.
    pub length: Option<u64>,

    /// MD5 digest, if any source supplied one.
    pub md5: Option<String>,

    /// sha512t24u digest, if any source supplied one.
    pub sha512t24u: Option<String>,

    /// Alternative names that resolve to this contig.
    pub aliases: HashSet<String>,

    /// Assembly label taken from the first source that provided one.
    pub assembly: Option<String>,

    /// URI taken from the first source that provided one.
    pub uri: Option<String>,

    /// Species taken from the first source that provided one.
    pub species: Option<String>,

    /// Role of the sequence; starts as `SequenceRole::Unknown`.
    pub sequence_role: SequenceRole,

    /// Labels of the inputs that contributed to this contig, in merge order.
    pub sources: Vec<String>,
}
137
138impl ContigMetadata {
139 fn new(name: String) -> Self {
140 Self {
141 primary_name: name,
142 length: None,
143 md5: None,
144 sha512t24u: None,
145 aliases: HashSet::new(),
146 assembly: None,
147 uri: None,
148 species: None,
149 sequence_role: SequenceRole::Unknown,
150 sources: Vec::new(),
151 }
152 }
153
154 fn to_contig(&self) -> Option<Contig> {
156 let length = self.length?;
157 let mut contig = Contig::new(&self.primary_name, length);
158 contig.md5.clone_from(&self.md5);
159 contig.sha512t24u.clone_from(&self.sha512t24u);
160 contig.assembly.clone_from(&self.assembly);
161 contig.uri.clone_from(&self.uri);
162 contig.species.clone_from(&self.species);
163 contig.aliases = self.aliases.iter().cloned().collect();
164 contig.sequence_role = self.sequence_role;
165 Some(contig)
166 }
167}
168
/// Statistics recorded for one input file after it was merged successfully.
#[derive(Debug, Clone)]
pub struct InputRecord {
    /// Display form of the input path.
    pub path: String,
    /// Format the input was parsed as.
    pub format: InputFormat,
    /// Number of contigs the parser returned for this input.
    pub contigs_found: usize,
    /// Number of those contigs that matched an already-known contig.
    pub contigs_merged: usize,
    /// Number of aliases this input added.
    pub aliases_added: usize,
}
178
/// Incrementally assembles a `KnownReference` from one or more input files,
/// merging contigs that share a name or alias.
pub struct ReferenceBuilder {
    /// Identifier of the reference being built.
    id: String,
    /// Human-readable name; also used to guess the assembly when none is set.
    display_name: String,
    /// Explicit assembly; when set it is stamped onto every contig at build time.
    assembly: Option<Assembly>,
    /// Origin of the reference; `build` falls back to `Custom("Unknown")`.
    source: Option<ReferenceSource>,
    /// Optional free-text description.
    description: Option<String>,
    /// Optional download URL; when set it becomes every contig's `uri` at build time.
    download_url: Option<String>,
    /// Optional URL of the assembly report.
    assembly_report_url: Option<String>,
    /// Tags copied onto the built reference.
    tags: Vec<String>,

    /// Metadata per contig, keyed by primary name.
    contigs: HashMap<String, ContigMetadata>,

    /// Primary names in the order the contigs were first seen.
    contig_order: Vec<String>,

    /// Maps each alias to the primary name it belongs to.
    alias_to_primary: HashMap<String, String>,

    /// One record per input that was merged without error.
    inputs_processed: Vec<InputRecord>,

    /// Messages for every length/digest conflict encountered.
    conflicts: Vec<String>,

    /// Non-fatal problems, e.g. an alias already claimed by another contig.
    warnings: Vec<String>,

    /// Explicit species; when set it is stamped onto every contig at build time.
    species: Option<String>,

    /// Passed to `to_contig_with_options` when reading NCBI assembly reports.
    generate_ucsc_names: bool,
}
228
229impl ReferenceBuilder {
230 pub fn new(id: impl Into<String>, display_name: impl Into<String>) -> Self {
232 Self {
233 id: id.into(),
234 display_name: display_name.into(),
235 assembly: None,
236 source: None,
237 description: None,
238 download_url: None,
239 assembly_report_url: None,
240 tags: Vec::new(),
241 contigs: HashMap::new(),
242 contig_order: Vec::new(),
243 alias_to_primary: HashMap::new(),
244 inputs_processed: Vec::new(),
245 conflicts: Vec::new(),
246 warnings: Vec::new(),
247 species: None,
248 generate_ucsc_names: true, }
250 }
251
252 #[must_use]
272 pub fn generate_ucsc_names(mut self, generate: bool) -> Self {
273 self.generate_ucsc_names = generate;
274 self
275 }
276
277 #[must_use]
278 pub fn assembly(mut self, assembly: Assembly) -> Self {
279 self.assembly = Some(assembly);
280 self
281 }
282
283 #[must_use]
284 pub fn source(mut self, source: ReferenceSource) -> Self {
285 self.source = Some(source);
286 self
287 }
288
289 #[must_use]
290 pub fn description(mut self, description: impl Into<String>) -> Self {
291 self.description = Some(description.into());
292 self
293 }
294
295 #[must_use]
296 pub fn download_url(mut self, url: impl Into<String>) -> Self {
297 self.download_url = Some(url.into());
298 self
299 }
300
301 #[must_use]
302 pub fn assembly_report_url(mut self, url: impl Into<String>) -> Self {
303 self.assembly_report_url = Some(url.into());
304 self
305 }
306
307 #[must_use]
308 pub fn tags(mut self, tags: Vec<String>) -> Self {
309 self.tags = tags;
310 self
311 }
312
313 #[must_use]
318 pub fn species(mut self, species: impl Into<String>) -> Self {
319 self.species = Some(species.into());
320 self
321 }
322
323 pub fn add_input(&mut self, path: &Path) -> Result<(), BuilderError> {
330 let format = InputFormat::from_path(path).ok_or_else(|| {
331 BuilderError::Parse(format!("Cannot detect format for: {}", path.display()))
332 })?;
333 self.add_input_with_format(path, format)
334 }
335
336 pub fn add_input_with_format(
343 &mut self,
344 path: &Path,
345 format: InputFormat,
346 ) -> Result<(), BuilderError> {
347 let path_str = path.display().to_string();
348
349 match format {
350 InputFormat::Dict | InputFormat::Sam => {
351 self.add_dict_or_sam(path, &path_str, format)?;
352 }
353 InputFormat::Bam => {
354 self.add_bam(path, &path_str)?;
355 }
356 InputFormat::Cram => {
357 self.add_cram(path, &path_str)?;
358 }
359 InputFormat::Fai => {
360 self.add_fai(path, &path_str)?;
361 }
362 InputFormat::Fasta => {
363 self.add_fasta(path, &path_str)?;
364 }
365 InputFormat::NcbiReport => {
366 self.add_ncbi_report(path, &path_str)?;
367 }
368 InputFormat::Vcf => {
369 self.add_vcf(path, &path_str)?;
370 }
371 InputFormat::Tsv => {
372 self.add_tsv(path, &path_str)?;
373 }
374 }
375
376 Ok(())
377 }
378
379 fn add_dict_or_sam(
380 &mut self,
381 path: &Path,
382 path_str: &str,
383 format: InputFormat,
384 ) -> Result<(), BuilderError> {
385 let content = std::fs::read_to_string(path)?;
386 let query = crate::parsing::sam::parse_header_text(&content)
387 .map_err(|e| BuilderError::Parse(e.to_string()))?;
388
389 let mut record = InputRecord {
390 path: path_str.to_string(),
391 format,
392 contigs_found: query.contigs.len(),
393 contigs_merged: 0,
394 aliases_added: 0,
395 };
396
397 for contig in query.contigs {
398 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
399 if merged {
400 record.contigs_merged += 1;
401 }
402 record.aliases_added += aliases;
403 }
404
405 self.inputs_processed.push(record);
406 Ok(())
407 }
408
409 fn add_bam(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
410 let query = crate::parsing::sam::parse_file(path)
411 .map_err(|e| BuilderError::Parse(e.to_string()))?;
412
413 let mut record = InputRecord {
414 path: path_str.to_string(),
415 format: InputFormat::Bam,
416 contigs_found: query.contigs.len(),
417 contigs_merged: 0,
418 aliases_added: 0,
419 };
420
421 for contig in query.contigs {
422 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
423 if merged {
424 record.contigs_merged += 1;
425 }
426 record.aliases_added += aliases;
427 }
428
429 self.inputs_processed.push(record);
430 Ok(())
431 }
432
433 fn add_cram(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
434 let query = crate::parsing::sam::parse_file(path)
435 .map_err(|e| BuilderError::Parse(e.to_string()))?;
436
437 let mut record = InputRecord {
438 path: path_str.to_string(),
439 format: InputFormat::Cram,
440 contigs_found: query.contigs.len(),
441 contigs_merged: 0,
442 aliases_added: 0,
443 };
444
445 for contig in query.contigs {
446 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
447 if merged {
448 record.contigs_merged += 1;
449 }
450 record.aliases_added += aliases;
451 }
452
453 self.inputs_processed.push(record);
454 Ok(())
455 }
456
457 fn add_fai(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
458 let content = std::fs::read_to_string(path)?;
459 let query = crate::parsing::fai::parse_fai_text(&content)
460 .map_err(|e| BuilderError::Parse(e.to_string()))?;
461
462 let mut record = InputRecord {
463 path: path_str.to_string(),
464 format: InputFormat::Fai,
465 contigs_found: query.contigs.len(),
466 contigs_merged: 0,
467 aliases_added: 0,
468 };
469
470 for contig in query.contigs {
471 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
472 if merged {
473 record.contigs_merged += 1;
474 }
475 record.aliases_added += aliases;
476 }
477
478 self.inputs_processed.push(record);
479 Ok(())
480 }
481
482 fn add_ncbi_report(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
483 let content = std::fs::read_to_string(path)?;
484 let entries = crate::parsing::ncbi_report::parse_ncbi_report_text(&content)
485 .map_err(|e| BuilderError::Parse(e.to_string()))?;
486
487 let mut record = InputRecord {
488 path: path_str.to_string(),
489 format: InputFormat::NcbiReport,
490 contigs_found: entries.len(),
491 contigs_merged: 0,
492 aliases_added: 0,
493 };
494
495 for entry in entries {
496 let contig = entry.to_contig_with_options(self.generate_ucsc_names);
498 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
499 if merged {
500 record.contigs_merged += 1;
501 }
502 record.aliases_added += aliases;
503 }
504
505 self.inputs_processed.push(record);
506 Ok(())
507 }
508
509 fn add_vcf(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
510 let content = std::fs::read_to_string(path)?;
511 let query = crate::parsing::vcf::parse_vcf_header_text(&content)
512 .map_err(|e| BuilderError::Parse(e.to_string()))?;
513
514 let mut record = InputRecord {
515 path: path_str.to_string(),
516 format: InputFormat::Vcf,
517 contigs_found: query.contigs.len(),
518 contigs_merged: 0,
519 aliases_added: 0,
520 };
521
522 for contig in query.contigs {
523 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
524 if merged {
525 record.contigs_merged += 1;
526 }
527 record.aliases_added += aliases;
528 }
529
530 self.inputs_processed.push(record);
531 Ok(())
532 }
533
534 fn add_tsv(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
535 let content = std::fs::read_to_string(path)?;
536 let query = crate::parsing::tsv::parse_tsv_text(&content, '\t')
537 .map_err(|e| BuilderError::Parse(e.to_string()))?;
538
539 let mut record = InputRecord {
540 path: path_str.to_string(),
541 format: InputFormat::Tsv,
542 contigs_found: query.contigs.len(),
543 contigs_merged: 0,
544 aliases_added: 0,
545 };
546
547 for contig in query.contigs {
548 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
549 if merged {
550 record.contigs_merged += 1;
551 }
552 record.aliases_added += aliases;
553 }
554
555 self.inputs_processed.push(record);
556 Ok(())
557 }
558
559 fn add_fasta(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
560 let query = crate::parsing::fasta::parse_fasta_file_with_md5(path)
561 .map_err(|e| BuilderError::Parse(e.to_string()))?;
562
563 let mut record = InputRecord {
564 path: path_str.to_string(),
565 format: InputFormat::Fasta,
566 contigs_found: query.contigs.len(),
567 contigs_merged: 0,
568 aliases_added: 0,
569 };
570
571 for contig in query.contigs {
572 let (merged, aliases) = self.merge_contig(&contig, path_str)?;
573 if merged {
574 record.contigs_merged += 1;
575 }
576 record.aliases_added += aliases;
577 }
578
579 self.inputs_processed.push(record);
580 Ok(())
581 }
582
583 fn check_digest_conflict(
588 existing: &mut Option<String>,
589 incoming: Option<&String>,
590 field_name: &str,
591 contig_name: &str,
592 source: &str,
593 conflicts: &mut Vec<String>,
594 validator: fn(&str) -> bool,
595 ) -> Result<(), BuilderError> {
596 if let Some(new_val) = incoming {
597 if !validator(new_val) {
598 let msg = format!(
599 "Invalid {field_name} for '{contig_name}': '{new_val}' (from {source})"
600 );
601 return Err(BuilderError::Validation(msg));
602 }
603 if let Some(existing_val) = existing.as_ref() {
604 if existing_val != new_val {
605 let msg = format!(
606 "{field_name} conflict for '{contig_name}': {existing_val} vs {new_val} (from {source})"
607 );
608 conflicts.push(msg.clone());
609 return Err(BuilderError::Conflict(msg));
610 }
611 } else {
612 *existing = Some(new_val.clone());
613 }
614 }
615 Ok(())
616 }
617
618 fn find_primary_for_contig(&self, contig: &Contig) -> Option<String> {
620 std::iter::once(&contig.name)
621 .chain(contig.aliases.iter())
622 .find_map(|name| self.find_existing_primary(name))
623 }
624
    /// Merge one parsed contig into the builder state.
    ///
    /// If the contig's name or any alias is already known, its data is folded
    /// into the existing entry; otherwise a new entry is created and appended
    /// to `contig_order`.
    ///
    /// Returns `(merged, aliases_added)`: `merged` is `true` when an existing
    /// entry was updated and `false` when a new one was created.
    ///
    /// # Errors
    ///
    /// * `BuilderError::Conflict` when the length or a digest disagrees with
    ///   the stored value (the message is also appended to `self.conflicts`).
    /// * `BuilderError::Validation` when an incoming digest is rejected by
    ///   its validator on the merge path.
    ///
    /// An error returns early; fields updated before the failing check stay
    /// updated.
    fn merge_contig(
        &mut self,
        contig: &Contig,
        source: &str,
    ) -> Result<(bool, usize), BuilderError> {
        if let Some(primary) = self.find_primary_for_contig(contig) {
            // In the code visible here, `find_primary_for_contig` returns either
            // a key of `self.contigs` or a value of `alias_to_primary`, and those
            // values are always names that are (or are about to be) keys.
            let metadata = self.contigs.get_mut(&primary).unwrap();
            let aliases_before = metadata.aliases.len();

            // Length: must agree with what is stored, or fill the gap.
            if let Some(existing_len) = metadata.length {
                if existing_len != contig.length {
                    let msg = format!(
                        "Length conflict for '{}': {} vs {} (from {})",
                        contig.name, existing_len, contig.length, source
                    );
                    self.conflicts.push(msg.clone());
                    return Err(BuilderError::Conflict(msg));
                }
            } else {
                metadata.length = Some(contig.length);
            }

            // Digests: validated, then compared with (or stored into) the entry.
            Self::check_digest_conflict(
                &mut metadata.md5,
                contig.md5.as_ref(),
                "MD5",
                &contig.name,
                source,
                &mut self.conflicts,
                is_valid_md5,
            )?;
            Self::check_digest_conflict(
                &mut metadata.sha512t24u,
                contig.sha512t24u.as_ref(),
                "sha512t24u",
                &contig.name,
                source,
                &mut self.conflicts,
                is_valid_sha512t24u,
            )?;

            // Aliases: add new ones, but never steal an alias that already
            // points at a different primary; that only produces a warning.
            for alias in &contig.aliases {
                if !metadata.aliases.contains(alias) && alias != &metadata.primary_name {
                    if let Some(other_primary) = self.alias_to_primary.get(alias) {
                        if other_primary != &primary {
                            self.warnings.push(format!(
                                "Alias '{alias}' already mapped to '{other_primary}', skipping for '{primary}'"
                            ));
                            continue;
                        }
                    }
                    metadata.aliases.insert(alias.clone());
                    self.alias_to_primary.insert(alias.clone(), primary.clone());
                }
            }

            // The match may have come through an alias; then the incoming
            // name itself becomes an alias of the primary.
            if contig.name != primary && !metadata.aliases.contains(&contig.name) {
                metadata.aliases.insert(contig.name.clone());
                self.alias_to_primary
                    .insert(contig.name.clone(), primary.clone());
            }

            // Descriptive fields are first-writer-wins: only fill what is empty.
            if metadata.assembly.is_none() && contig.assembly.is_some() {
                metadata.assembly.clone_from(&contig.assembly);
            }
            if metadata.uri.is_none() && contig.uri.is_some() {
                metadata.uri.clone_from(&contig.uri);
            }
            if metadata.species.is_none() && contig.species.is_some() {
                metadata.species.clone_from(&contig.species);
            }
            if matches!(metadata.sequence_role, SequenceRole::Unknown) {
                metadata.sequence_role = contig.sequence_role;
            }

            metadata.sources.push(source.to_string());

            // Counts aliases from the incoming list plus the incoming name,
            // if it was added above.
            let aliases_added = metadata.aliases.len() - aliases_before;
            Ok((true, aliases_added))
        } else {
            // First sighting: copy everything over as-is.
            // NOTE(review): digests are stored here without running
            // `is_valid_md5` / `is_valid_sha512t24u`; validation only happens
            // on the merge path above. Confirm whether that is intended.
            let mut metadata = ContigMetadata::new(contig.name.clone());
            metadata.length = Some(contig.length);
            metadata.md5.clone_from(&contig.md5);
            metadata.sha512t24u.clone_from(&contig.sha512t24u);
            metadata.assembly.clone_from(&contig.assembly);
            metadata.uri.clone_from(&contig.uri);
            metadata.species.clone_from(&contig.species);
            metadata.sequence_role = contig.sequence_role;
            metadata.sources.push(source.to_string());

            let mut aliases_added = 0;
            for alias in &contig.aliases {
                if alias != &contig.name {
                    // This branch runs only when no alias resolved to an
                    // existing contig, so this lookup can only hit an alias
                    // registered earlier in this same loop (a duplicate).
                    if let Some(other_primary) = self.alias_to_primary.get(alias) {
                        self.warnings.push(format!(
                            "Alias '{}' already mapped to '{}', skipping for '{}'",
                            alias, other_primary, contig.name
                        ));
                        continue;
                    }
                    metadata.aliases.insert(alias.clone());
                    self.alias_to_primary
                        .insert(alias.clone(), contig.name.clone());
                    aliases_added += 1;
                }
            }

            self.contig_order.push(contig.name.clone());
            self.contigs.insert(contig.name.clone(), metadata);

            Ok((false, aliases_added))
        }
    }
750
751 fn find_existing_primary(&self, name: &str) -> Option<String> {
753 if self.contigs.contains_key(name) {
755 return Some(name.to_string());
756 }
757
758 if let Some(primary) = self.alias_to_primary.get(name) {
760 return Some(primary.clone());
761 }
762
763 None
764 }
765
766 pub fn build(self) -> Result<KnownReference, BuilderError> {
773 if self.contigs.is_empty() {
775 return Err(BuilderError::MissingField("No contigs added".to_string()));
776 }
777
778 let mut missing_length = Vec::new();
780 for name in &self.contig_order {
781 if let Some(metadata) = self.contigs.get(name) {
782 if metadata.length.is_none() {
783 missing_length.push(name.clone());
784 }
785 }
786 }
787 if !missing_length.is_empty() {
788 return Err(BuilderError::MissingField(format!(
789 "Missing length for contigs: {missing_length:?}"
790 )));
791 }
792
793 let mut contigs: Vec<Contig> = self
795 .contig_order
796 .iter()
797 .filter_map(|name| self.contigs.get(name))
798 .filter_map(ContigMetadata::to_contig)
799 .collect();
800
801 if let Some(ref url) = self.download_url {
805 for contig in &mut contigs {
806 contig.uri = Some(url.clone());
807 }
808 }
809
810 if let Some(ref assembly) = self.assembly {
814 let assembly_str = assembly.to_string();
815 for contig in &mut contigs {
816 contig.assembly = Some(assembly_str.clone());
817 }
818 }
819
820 if let Some(ref species) = self.species {
824 for contig in &mut contigs {
825 contig.species = Some(species.clone());
826 }
827 }
828
829 let mut contigs_missing_from_fasta: Vec<String> = Vec::new();
832 for (name, meta) in &self.contigs {
833 if meta.md5.is_none() && matches!(meta.sequence_role, SequenceRole::AssembledMolecule) {
834 contigs_missing_from_fasta.push(name.clone());
835 }
836 }
837 contigs_missing_from_fasta.sort();
838
839 let assembly = self
841 .assembly
842 .unwrap_or_else(|| detect_assembly_from_name(&self.display_name));
843
844 let source = self
846 .source
847 .unwrap_or(ReferenceSource::Custom("Unknown".to_string()));
848
849 let naming_convention = crate::core::contig::detect_naming_convention(&contigs);
851
852 let mut reference = KnownReference {
853 id: crate::core::types::ReferenceId::new(&self.id),
854 display_name: self.display_name,
855 assembly,
856 source,
857 naming_convention,
858 download_url: self.download_url,
859 assembly_report_url: self.assembly_report_url,
860 contigs,
861 description: self.description,
862 tags: self.tags,
863 contigs_missing_from_fasta,
864 md5_set: HashSet::new(),
865 sha512t24u_set: HashSet::new(),
866 name_length_set: HashSet::new(),
867 signature: None,
868 };
869
870 reference.rebuild_indexes();
871 Ok(reference)
872 }
873
874 #[must_use]
876 pub fn summary(&self) -> BuildSummary {
877 let total_contigs = self.contigs.len();
878 let with_length = self.contigs.values().filter(|m| m.length.is_some()).count();
879 let with_md5 = self.contigs.values().filter(|m| m.md5.is_some()).count();
880 let with_aliases = self
881 .contigs
882 .values()
883 .filter(|m| !m.aliases.is_empty())
884 .count();
885
886 let mut missing_md5_assembled: Vec<String> = Vec::new();
888 for (name, meta) in &self.contigs {
889 if meta.md5.is_none() {
890 if matches!(meta.sequence_role, SequenceRole::AssembledMolecule) {
892 missing_md5_assembled.push(name.clone());
893 }
894 }
895 }
896
897 let primary_count = self
899 .contigs
900 .keys()
901 .filter(|name| {
902 let contig = Contig::new(name.as_str(), 0);
903 contig.is_primary_chromosome()
904 })
905 .count();
906
907 let alt_count = self
909 .contigs
910 .keys()
911 .filter(|name| name.ends_with("_alt") || name.contains("_alt_"))
912 .count();
913
914 BuildSummary {
915 id: self.id.clone(),
916 display_name: self.display_name.clone(),
917 assembly: self.assembly.clone(),
918 source: self.source.clone(),
919 inputs: self.inputs_processed.clone(),
920 total_contigs,
921 primary_chromosomes: primary_count,
922 alt_contigs: alt_count,
923 other_contigs: total_contigs.saturating_sub(primary_count + alt_count),
924 with_length,
925 with_md5,
926 with_aliases,
927 missing_md5_assembled,
928 conflicts: self.conflicts.clone(),
929 warnings: self.warnings.clone(),
930 }
931 }
932}
933
/// Point-in-time report of a `ReferenceBuilder`'s state, printable via `Display`.
#[derive(Debug, Clone)]
pub struct BuildSummary {
    /// Identifier of the reference being built.
    pub id: String,
    /// Human-readable name of the reference.
    pub display_name: String,
    /// Explicitly configured assembly, if any.
    pub assembly: Option<Assembly>,
    /// Explicitly configured source, if any.
    pub source: Option<ReferenceSource>,
    /// One record per input merged so far.
    pub inputs: Vec<InputRecord>,
    /// Number of distinct contigs.
    pub total_contigs: usize,
    /// Contigs whose name passes `Contig::is_primary_chromosome`.
    pub primary_chromosomes: usize,
    /// Contigs whose name ends with `_alt` or contains `_alt_`.
    pub alt_contigs: usize,
    /// Contigs that are neither primary nor ALT (saturating subtraction).
    pub other_contigs: usize,
    /// Contigs with a known length.
    pub with_length: usize,
    /// Contigs with an MD5 digest.
    pub with_md5: usize,
    /// Contigs with at least one alias.
    pub with_aliases: usize,
    /// Names of assembled-molecule contigs that have no MD5.
    pub missing_md5_assembled: Vec<String>,
    /// Conflict messages collected by the builder.
    pub conflicts: Vec<String>,
    /// Warning messages collected by the builder.
    pub warnings: Vec<String>,
}
953
954impl std::fmt::Display for BuildSummary {
955 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
956 writeln!(f, "Reference Builder Summary")?;
957 writeln!(f, "=========================")?;
958 writeln!(f, "ID: {}", self.id)?;
959 writeln!(f, "Name: {}", self.display_name)?;
960 if let Some(ref assembly) = self.assembly {
961 writeln!(f, "Assembly: {assembly:?}")?;
962 }
963 if let Some(ref source) = self.source {
964 writeln!(f, "Source: {source:?}")?;
965 }
966 writeln!(f)?;
967
968 writeln!(f, "Inputs:")?;
969 for (i, input) in self.inputs.iter().enumerate() {
970 writeln!(
971 f,
972 " [{}] {} ({:?}) -> {} contigs, {} merged, {} aliases",
973 i + 1,
974 input.path,
975 input.format,
976 input.contigs_found,
977 input.contigs_merged,
978 input.aliases_added
979 )?;
980 }
981 writeln!(f)?;
982
983 writeln!(f, "Contigs: {} total", self.total_contigs)?;
984 writeln!(f, " - Primary chromosomes: {}", self.primary_chromosomes)?;
985 writeln!(f, " - ALT contigs: {}", self.alt_contigs)?;
986 writeln!(f, " - Other: {}", self.other_contigs)?;
987 writeln!(f)?;
988
989 writeln!(f, "Coverage:")?;
990 let pct = |n: usize, total: usize| {
991 if total == 0 {
992 0
993 } else {
994 (n * 100) / total
995 }
996 };
997 let check = |n: usize, total: usize| {
998 if n == total {
999 "+"
1000 } else {
1001 "o"
1002 }
1003 };
1004 writeln!(
1005 f,
1006 " {} Length: {}/{} ({}%)",
1007 check(self.with_length, self.total_contigs),
1008 self.with_length,
1009 self.total_contigs,
1010 pct(self.with_length, self.total_contigs)
1011 )?;
1012 writeln!(
1013 f,
1014 " {} MD5: {}/{} ({}%)",
1015 check(self.with_md5, self.total_contigs),
1016 self.with_md5,
1017 self.total_contigs,
1018 pct(self.with_md5, self.total_contigs)
1019 )?;
1020 writeln!(
1021 f,
1022 " {} Aliases: {}/{} ({}%)",
1023 check(self.with_aliases, self.total_contigs),
1024 self.with_aliases,
1025 self.total_contigs,
1026 pct(self.with_aliases, self.total_contigs)
1027 )?;
1028 writeln!(f)?;
1029
1030 writeln!(f, "Conflicts: {}", self.conflicts.len())?;
1031 for conflict in &self.conflicts {
1032 writeln!(f, " - {conflict}")?;
1033 }
1034
1035 let total_warnings = self.warnings.len() + self.missing_md5_assembled.len();
1036 writeln!(f, "Warnings: {total_warnings}")?;
1037 for warning in &self.warnings {
1038 writeln!(f, " - {warning}")?;
1039 }
1040 if !self.missing_md5_assembled.is_empty() {
1041 writeln!(
1042 f,
1043 " - Missing MD5 for assembled-molecule contigs: {}",
1044 self.missing_md5_assembled.join(", ")
1045 )?;
1046 }
1047
1048 Ok(())
1049 }
1050}
1051
/// Builds a `FastaDistribution` from one or more input files, merging
/// contigs that share the same name and length.
#[derive(Debug)]
pub struct DistributionBuilder {
    /// Identifier of the distribution.
    id: String,
    /// Human-readable name; empty until set.
    display_name: String,
    /// Origin of the distribution; defaults to `Custom("custom")`.
    source: ReferenceSource,
    /// Optional download URL.
    download_url: Option<String>,
    /// Tags copied onto the built distribution.
    tags: Vec<String>,

    /// Contigs keyed by `(name, length)`.
    contigs: HashMap<(String, u64), FastaContig>,

    /// Keys of `contigs` in the order they were first seen.
    insertion_order: Vec<(String, u64)>,

    /// Display form of every path passed to `add_input_with_format`.
    source_files: Vec<String>,

    /// Passed to `to_contig_with_options` when reading NCBI assembly reports.
    generate_ucsc_names: bool,
}
1079
1080impl Default for DistributionBuilder {
1081 fn default() -> Self {
1082 Self::new("")
1083 }
1084}
1085
1086impl DistributionBuilder {
1087 pub fn new(id: impl Into<String>) -> Self {
1089 Self {
1090 id: id.into(),
1091 display_name: String::new(),
1092 source: ReferenceSource::Custom("custom".to_string()),
1093 download_url: None,
1094 tags: Vec::new(),
1095 contigs: HashMap::new(),
1096 insertion_order: Vec::new(),
1097 source_files: Vec::new(),
1098 generate_ucsc_names: true, }
1100 }
1101
1102 #[must_use]
1106 pub fn with_generate_ucsc_names(mut self, generate: bool) -> Self {
1107 self.generate_ucsc_names = generate;
1108 self
1109 }
1110
1111 #[must_use]
1113 pub fn with_display_name(mut self, name: impl Into<String>) -> Self {
1114 self.display_name = name.into();
1115 self
1116 }
1117
1118 #[must_use]
1120 pub fn with_source(mut self, source: ReferenceSource) -> Self {
1121 self.source = source;
1122 self
1123 }
1124
1125 #[must_use]
1127 pub fn with_download_url(mut self, url: impl Into<String>) -> Self {
1128 self.download_url = Some(url.into());
1129 self
1130 }
1131
1132 #[must_use]
1134 pub fn with_tags(mut self, tags: Vec<String>) -> Self {
1135 self.tags = tags;
1136 self
1137 }
1138
1139 pub fn add_input(&mut self, path: &Path) -> Result<&mut Self, BuilderError> {
1146 let format = InputFormat::from_path(path).ok_or_else(|| {
1147 BuilderError::Parse(format!("Cannot detect format for: {}", path.display()))
1148 })?;
1149 self.add_input_with_format(path, format)
1150 }
1151
1152 pub fn add_input_with_format(
1159 &mut self,
1160 path: &Path,
1161 format: InputFormat,
1162 ) -> Result<&mut Self, BuilderError> {
1163 let path_str = path.display().to_string();
1164 self.source_files.push(path_str.clone());
1165
1166 let contigs = self.parse_input(path, format)?;
1167
1168 for contig in contigs {
1169 let key = (contig.name.clone(), contig.length);
1170 #[allow(clippy::cast_possible_truncation)] let fasta_contig = FastaContig {
1172 name: contig.name,
1173 length: contig.length,
1174 md5: contig.md5.unwrap_or_default(),
1175 sort_order: self.insertion_order.len() as u32,
1176 report_contig_id: None,
1177 aliases: contig.aliases,
1178 };
1179
1180 if let Some(existing) = self.contigs.get_mut(&key) {
1181 existing.merge(&fasta_contig)?;
1182 } else {
1183 self.insertion_order.push(key.clone());
1184 self.contigs.insert(key, fasta_contig);
1185 }
1186 }
1187
1188 Ok(self)
1189 }
1190
1191 fn parse_input(&self, path: &Path, format: InputFormat) -> Result<Vec<Contig>, BuilderError> {
1193 match format {
1194 InputFormat::Dict | InputFormat::Sam => {
1195 let content = std::fs::read_to_string(path)?;
1196 let query = crate::parsing::sam::parse_header_text(&content)
1197 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1198 Ok(query.contigs)
1199 }
1200 InputFormat::Bam | InputFormat::Cram => {
1201 let query = crate::parsing::sam::parse_file(path)
1202 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1203 Ok(query.contigs)
1204 }
1205 InputFormat::Fai => {
1206 let content = std::fs::read_to_string(path)?;
1207 let query = crate::parsing::fai::parse_fai_text(&content)
1208 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1209 Ok(query.contigs)
1210 }
1211 InputFormat::Fasta => {
1212 let query = crate::parsing::fasta::parse_fasta_file_with_md5(path)
1214 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1215 Ok(query.contigs)
1216 }
1217 InputFormat::NcbiReport => {
1218 let content = std::fs::read_to_string(path)?;
1219 let entries = crate::parsing::ncbi_report::parse_ncbi_report_text(&content)
1220 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1221 Ok(entries
1223 .into_iter()
1224 .map(|e| e.to_contig_with_options(self.generate_ucsc_names))
1225 .collect())
1226 }
1227 InputFormat::Vcf => {
1228 let content = std::fs::read_to_string(path)?;
1229 let query = crate::parsing::vcf::parse_vcf_header_text(&content)
1230 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1231 Ok(query.contigs)
1232 }
1233 InputFormat::Tsv => {
1234 let content = std::fs::read_to_string(path)?;
1235 let query = crate::parsing::tsv::parse_tsv_text(&content, '\t')
1236 .map_err(|e| BuilderError::Parse(e.to_string()))?;
1237 Ok(query.contigs)
1238 }
1239 }
1240 }
1241
1242 pub fn build(self) -> Result<FastaDistribution, BuilderError> {
1248 if self.contigs.is_empty() {
1249 return Err(BuilderError::MissingField("No contigs found".to_string()));
1250 }
1251
1252 let mut contigs: Vec<FastaContig> = Vec::with_capacity(self.insertion_order.len());
1254 for (i, key) in self.insertion_order.iter().enumerate() {
1255 if let Some(mut contig) = self.contigs.get(key).cloned() {
1256 #[allow(clippy::cast_possible_truncation)] {
1258 contig.sort_order = i as u32;
1259 }
1260 contigs.push(contig);
1261 }
1262 }
1263
1264 Ok(FastaDistribution {
1265 id: self.id,
1266 display_name: self.display_name,
1267 source: self.source,
1268 download_url: self.download_url,
1269 tags: self.tags,
1270 contigs,
1271 })
1272 }
1273}
1274
1275#[must_use]
1277pub fn detect_assembly_from_name(display_name: &str) -> Assembly {
1278 let lower = display_name.to_lowercase();
1279 if lower.contains("chm13") || lower.contains("t2t") {
1280 Assembly::Other("CHM13".to_string())
1281 } else if lower.contains("38") || lower.contains("hg38") || lower.contains("hs38") {
1282 Assembly::Grch38
1283 } else if lower.contains("37")
1284 || lower.contains("19")
1285 || lower.contains("hg19")
1286 || lower.contains("hs37")
1287 || lower.contains("b37")
1288 {
1289 Assembly::Grch37
1290 } else {
1291 Assembly::Other(display_name.to_string())
1292 }
1293}
1294
1295#[cfg(test)]
1296mod tests {
1297 use super::*;
1298
1299 #[test]
1300 fn test_input_format_detection() {
1301 assert_eq!(
1302 InputFormat::from_path(Path::new("test.dict")),
1303 Some(InputFormat::Dict)
1304 );
1305 assert_eq!(
1306 InputFormat::from_path(Path::new("test.fai")),
1307 Some(InputFormat::Fai)
1308 );
1309 assert_eq!(
1310 InputFormat::from_path(Path::new("test.fa")),
1311 Some(InputFormat::Fasta)
1312 );
1313 assert_eq!(
1314 InputFormat::from_path(Path::new("test.fasta")),
1315 Some(InputFormat::Fasta)
1316 );
1317 assert_eq!(
1318 InputFormat::from_path(Path::new("test.fna")),
1319 Some(InputFormat::Fasta)
1320 );
1321 assert_eq!(
1322 InputFormat::from_path(Path::new("test.fa.gz")),
1323 Some(InputFormat::Fasta)
1324 );
1325 assert_eq!(
1326 InputFormat::from_path(Path::new("test.vcf")),
1327 Some(InputFormat::Vcf)
1328 );
1329 assert_eq!(
1330 InputFormat::from_path(Path::new("test.vcf.gz")),
1331 Some(InputFormat::Vcf)
1332 );
1333 assert_eq!(
1334 InputFormat::from_path(Path::new("GRCh38_assembly_report.txt")),
1335 Some(InputFormat::NcbiReport)
1336 );
1337 }
1338
/// Checks that `detect_assembly_from_name` classifies common display names.
#[test]
fn test_detect_assembly() {
    // Names expected to resolve to GRCh38.
    for name in &["GRCh38 (Broad)", "hg38 UCSC"] {
        let detected = detect_assembly_from_name(name);
        assert!(matches!(detected, Assembly::Grch38));
    }

    // Names expected to resolve to GRCh37.
    for name in &["GRCh37", "hg19"] {
        let detected = detect_assembly_from_name(name);
        assert!(matches!(detected, Assembly::Grch37));
    }

    // CHM13 is reported through the catch-all `Other` variant.
    let chm13 = detect_assembly_from_name("T2T-CHM13");
    assert!(matches!(chm13, Assembly::Other(_)));
}
1362
/// Builds a reference from a single merged contig and checks the id and the
/// resulting contig list.
#[test]
fn test_builder_single_input() {
    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .assembly(Assembly::Grch38)
        .source(ReferenceSource::Custom(String::from("test")));

    builder
        .merge_contig(&Contig::new("chr1", 248_956_422), "test")
        .unwrap();

    let built = builder.build().unwrap();
    assert_eq!(built.id.0, "test_ref");
    assert_eq!(built.contigs.len(), 1);
    assert_eq!(built.contigs[0].name, "chr1");
}
1378
/// Merges two contigs that refer to each other through an alias and checks
/// that they collapse into one entry which records the second name as an alias.
#[test]
fn test_builder_merge_with_aliases() {
    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference");

    // First source: the contig is named `chr1`.
    let prefixed = Contig::new("chr1", 248_956_422);
    builder.merge_contig(&prefixed, "source1").unwrap();

    // Second source: the contig is named `1` and lists `chr1` as an alias.
    let mut unprefixed = Contig::new("1", 248_956_422);
    unprefixed.aliases = vec![String::from("chr1")];
    builder.merge_contig(&unprefixed, "source2").unwrap();

    assert_eq!(builder.summary().total_contigs, 1);

    let built = builder.build().unwrap();
    assert_eq!(built.contigs.len(), 1);
    let merged_aliases = &built.contigs[0].aliases;
    assert!(merged_aliases.iter().any(|alias| alias == "1"));
}
1401
/// Merging a contig with the same name but a different length must be rejected
/// with `BuilderError::Conflict`.
#[test]
fn test_builder_conflict_detection() {
    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference");
    builder
        .merge_contig(&Contig::new("chr1", 248_956_422), "source1")
        .unwrap();

    // Same name, different length.
    let mismatched = Contig::new("chr1", 100_000);
    let outcome = builder.merge_contig(&mismatched, "source2");

    assert!(matches!(outcome, Err(BuilderError::Conflict(_))));
}
1416
/// Builds a distribution from one `.dict` file and checks that both contigs
/// come back with their names and MD5 values.
#[test]
fn test_distribution_builder_single_dict() {
    use std::io::Write;
    use tempfile::NamedTempFile;

    let dict_lines = [
        "@HD\tVN:1.6",
        "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd",
        "@SQ\tSN:chr2\tLN:2000\tM5:f98db672eb0993dcfdabafe2a882905c",
    ];
    let mut file = NamedTempFile::with_suffix(".dict").unwrap();
    for line in dict_lines.iter() {
        writeln!(file, "{}", line).unwrap();
    }

    let mut builder = DistributionBuilder::new("test_ref");
    builder.add_input(file.path()).unwrap();
    let dist = builder.build().unwrap();

    // Expected (name, md5) pairs, in file order.
    let expected = [
        ("chr1", "6aef897c3d6ff0c78aff06ac189178dd"),
        ("chr2", "f98db672eb0993dcfdabafe2a882905c"),
    ];
    assert_eq!(dist.contigs.len(), 2);
    for (contig, (name, md5)) in dist.contigs.iter().zip(expected.iter()) {
        assert_eq!(contig.name, *name);
        assert_eq!(contig.md5, *md5);
    }
}
1445
/// Feeds a `.dict` without an MD5 and a `.vcf` header with one, then checks
/// that the single resulting contig carries the MD5 from the VCF.
#[test]
fn test_distribution_builder_merges_inputs() {
    use std::io::Write;
    use tempfile::NamedTempFile;

    // Creates a temp file with the given suffix holding one line per entry.
    fn temp_file_with(suffix: &str, lines: &[&str]) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(suffix).unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }
        file
    }

    let dict = temp_file_with(".dict", &["@HD\tVN:1.6", "@SQ\tSN:chr1\tLN:1000"]);
    let vcf = temp_file_with(
        ".vcf",
        &[
            "##fileformat=VCFv4.2",
            "##contig=<ID=chr1,length=1000,md5=6aef897c3d6ff0c78aff06ac189178dd>",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
        ],
    );

    let mut builder = DistributionBuilder::new("test_ref");
    for input in [dict.path(), vcf.path()].iter() {
        builder.add_input(input).unwrap();
    }
    let distribution = builder.build().unwrap();

    assert_eq!(distribution.contigs.len(), 1);
    let merged = &distribution.contigs[0];
    assert_eq!(merged.md5, "6aef897c3d6ff0c78aff06ac189178dd");
}
1477
/// Two inputs that disagree on the MD5 of the same contig must make the second
/// `add_input` fail with `BuilderError::Merge`.
#[test]
fn test_distribution_builder_md5_conflict() {
    use std::io::Write;
    use tempfile::NamedTempFile;

    // Creates a temp file with the given suffix holding one line per entry.
    fn temp_file_with(suffix: &str, lines: &[&str]) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(suffix).unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }
        file
    }

    let dict = temp_file_with(
        ".dict",
        &[
            "@HD\tVN:1.6",
            "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd",
        ],
    );
    let vcf = temp_file_with(
        ".vcf",
        &[
            "##fileformat=VCFv4.2",
            "##contig=<ID=chr1,length=1000,md5=f98db672eb0993dcfdabafe2a882905c>",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
        ],
    );

    let mut builder = DistributionBuilder::new("test_ref");
    builder.add_input(dict.path()).unwrap();

    // The VCF reports a different MD5 for `chr1` than the dict did.
    let outcome = builder.add_input(vcf.path());
    assert!(matches!(outcome, Err(BuilderError::Merge(_))));
}
1506
/// When the builder is given a download URL, every built contig must report
/// that URL instead of the URI it was merged with.
#[test]
fn test_builder_download_url_overrides_contig_uri() {
    let local_uri = "file:///local/path/to/ref.fasta";
    let download_url = "https://example.com/ref.fasta";

    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .assembly(Assembly::Grch38)
        .source(ReferenceSource::Custom(String::from("test")))
        .download_url(download_url);

    // Both contigs are merged carrying the local file URI.
    for (name, length) in [("chr1", 248_956_422), ("chr2", 242_193_529)].iter() {
        let mut contig = Contig::new(*name, *length);
        contig.uri = Some(local_uri.to_owned());
        builder.merge_contig(&contig, "test").unwrap();
    }

    let built = builder.build().unwrap();
    assert_eq!(built.download_url.as_deref(), Some(download_url));

    for contig in built.contigs.iter() {
        let uri = contig.uri.as_deref();
        assert_eq!(
            uri,
            Some(download_url),
            "contig {} uri should be the download URL, not the local path",
            contig.name
        );
    }
}
1541
/// Without a builder-level download URL, the contig keeps the URI it was
/// merged with.
#[test]
fn test_builder_no_download_url_preserves_contig_uri() {
    let local_uri = "file:///local/path/to/ref.fasta";

    let mut contig = Contig::new("chr1", 248_956_422);
    contig.uri = Some(local_uri.to_owned());

    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .assembly(Assembly::Grch38)
        .source(ReferenceSource::Custom(String::from("test")));
    builder.merge_contig(&contig, "test").unwrap();

    let built = builder.build().unwrap();
    let first_uri = built.contigs[0].uri.as_deref();
    assert_eq!(first_uri, Some(local_uri));
}
1560
/// A builder-level download URL must replace the `UR:` values read from a
/// `.dict` input on every built contig.
#[test]
fn test_builder_download_url_overrides_dict_file_uri() {
    use std::io::Write;
    use tempfile::NamedTempFile;

    let download_url = "https://storage.googleapis.com/bucket/ref.fasta";

    // Dict whose @SQ records each carry a local `UR:` value.
    let dict_lines = [
        "@HD\tVN:1.6",
        "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd\tUR:file:///local/ref.fa",
        "@SQ\tSN:chr2\tLN:2000\tM5:f98db672eb0993dcfdabafe2a882905c\tUR:file:///local/ref.fa",
    ];
    let mut dict_file = NamedTempFile::with_suffix(".dict").unwrap();
    for line in dict_lines.iter() {
        writeln!(dict_file, "{}", line).unwrap();
    }

    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .assembly(Assembly::Grch38)
        .source(ReferenceSource::Custom(String::from("test")))
        .download_url(download_url);
    builder.add_input(dict_file.path()).unwrap();

    let built = builder.build().unwrap();
    assert_eq!(built.download_url.as_deref(), Some(download_url));

    for contig in built.contigs.iter() {
        let uri = contig.uri.as_deref();
        assert_eq!(
            uri,
            Some(download_url),
            "contig {} uri should be overridden by download_url",
            contig.name
        );
    }
}
1601
/// Contigs must come out in the order they appear in the `.dict` file, with
/// `sort_order` counting up from zero.
#[test]
fn test_distribution_builder_preserves_order() {
    use std::io::Write;
    use tempfile::NamedTempFile;

    // `chrM` is deliberately listed first.
    let dict_lines = [
        "@HD\tVN:1.6",
        "@SQ\tSN:chrM\tLN:16569\tM5:d2ed829b8a1628d16cbeee88e88e39eb",
        "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd",
        "@SQ\tSN:chr2\tLN:242193529\tM5:f98db672eb0993dcfdabafe2a882905c",
    ];
    let mut file = NamedTempFile::with_suffix(".dict").unwrap();
    for line in dict_lines.iter() {
        writeln!(file, "{}", line).unwrap();
    }

    let mut builder = DistributionBuilder::new("test_ref");
    builder.add_input(file.path()).unwrap();
    let dist = builder.build().unwrap();

    // Expected (name, sort_order) per position.
    let expected = [("chrM", 0), ("chr1", 1), ("chr2", 2)];
    for (position, (name, order)) in expected.iter().enumerate() {
        let contig = &dist.contigs[position];
        assert_eq!(contig.name, *name);
        assert_eq!(contig.sort_order, *order);
    }
}
1636
/// With a builder-level assembly set, every built contig must report
/// `GRCh38` rather than the `hg38` value it was merged with.
#[test]
fn test_builder_assembly_overrides_contig_assembly() {
    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .assembly(Assembly::Grch38)
        .source(ReferenceSource::Custom(String::from("test")));

    // Both contigs are merged carrying the `hg38` assembly label.
    for (name, length) in [("chr1", 248_956_422), ("chr2", 242_193_529)].iter() {
        let mut contig = Contig::new(*name, *length);
        contig.assembly = Some(String::from("hg38"));
        builder.merge_contig(&contig, "test").unwrap();
    }

    let built = builder.build().unwrap();

    for contig in built.contigs.iter() {
        let assembly = contig.assembly.as_deref();
        assert_eq!(
            assembly,
            Some("GRCh38"),
            "contig {} assembly should be overridden by builder assembly",
            contig.name
        );
    }
}
1664
/// Without a builder-level assembly, the contig keeps the assembly label it
/// was merged with.
#[test]
fn test_builder_no_assembly_preserves_contig_assembly() {
    let mut contig = Contig::new("chr1", 248_956_422);
    contig.assembly = Some(String::from("hg38"));

    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .source(ReferenceSource::Custom(String::from("test")));
    builder.merge_contig(&contig, "test").unwrap();

    let built = builder.build().unwrap();
    let first_assembly = built.contigs[0].assembly.as_deref();
    assert_eq!(first_assembly, Some("hg38"));
}
1680
/// With a builder-level species set, every built contig must report it,
/// whether or not the contig had its own species value when merged.
#[test]
fn test_builder_species_overrides_contig_species() {
    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .assembly(Assembly::Grch38)
        .source(ReferenceSource::Custom(String::from("test")))
        .species("Homo sapiens");

    // This contig is merged with its own species value.
    let mut with_species = Contig::new("chr1", 248_956_422);
    with_species.species = Some(String::from("Human"));
    builder.merge_contig(&with_species, "test").unwrap();

    // This one is merged without touching its species field.
    let without_species = Contig::new("chr2", 242_193_529);
    builder.merge_contig(&without_species, "test").unwrap();

    let built = builder.build().unwrap();

    for contig in built.contigs.iter() {
        let species = contig.species.as_deref();
        assert_eq!(
            species,
            Some("Homo sapiens"),
            "contig {} species should be overridden by builder species",
            contig.name
        );
    }
}
1709
/// Without a builder-level species, the contig keeps the species value it was
/// merged with.
#[test]
fn test_builder_no_species_preserves_contig_species() {
    let mut contig = Contig::new("chr1", 248_956_422);
    contig.species = Some(String::from("Human"));

    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .assembly(Assembly::Grch38)
        .source(ReferenceSource::Custom(String::from("test")));
    builder.merge_contig(&contig, "test").unwrap();

    let built = builder.build().unwrap();
    let first_species = built.contigs[0].species.as_deref();
    assert_eq!(first_species, Some("Human"));
}
1726
/// Builder-level assembly and species must replace the `AS:` and `SP:` values
/// read from a `.dict` input on every built contig.
#[test]
fn test_builder_assembly_overrides_dict_file_as_tag() {
    use std::io::Write;
    use tempfile::NamedTempFile;

    // Dict whose @SQ records carry their own `AS:` and `SP:` tags.
    let dict_lines = [
        "@HD\tVN:1.6",
        "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd\tAS:hg38\tSP:Human",
        "@SQ\tSN:chr2\tLN:2000\tM5:f98db672eb0993dcfdabafe2a882905c\tAS:hg38\tSP:Human",
    ];
    let mut dict_file = NamedTempFile::with_suffix(".dict").unwrap();
    for line in dict_lines.iter() {
        writeln!(dict_file, "{}", line).unwrap();
    }

    let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
        .assembly(Assembly::Grch38)
        .source(ReferenceSource::Custom(String::from("test")))
        .species("Homo sapiens");
    builder.add_input(dict_file.path()).unwrap();

    let built = builder.build().unwrap();

    for contig in built.contigs.iter() {
        let assembly = contig.assembly.as_deref();
        assert_eq!(
            assembly,
            Some("GRCh38"),
            "contig {} assembly should be overridden by builder assembly",
            contig.name
        );

        let species = contig.species.as_deref();
        assert_eq!(
            species,
            Some("Homo sapiens"),
            "contig {} species should be overridden by builder species",
            contig.name
        );
    }
}
1770
/// An `Assembly::Other` label set on the builder must appear verbatim as the
/// built contig's assembly.
#[test]
fn test_builder_chm13_assembly_display() {
    let assembly = Assembly::Other(String::from("T2T-CHM13v2.0"));
    let mut builder = ReferenceBuilder::new("chm13", "T2T-CHM13v2.0")
        .assembly(assembly)
        .source(ReferenceSource::Custom(String::from("test")));

    builder
        .merge_contig(&Contig::new("chr1", 248_387_328), "test")
        .unwrap();

    let built = builder.build().unwrap();
    let first_assembly = built.contigs[0].assembly.as_deref();
    assert_eq!(first_assembly, Some("T2T-CHM13v2.0"));
}
1787}