Skip to main content

ref_solver/catalog/
builder.rs

1//! Reference builder for creating catalog entries from multiple input sources.
2//!
3//! The `ReferenceBuilder` collates metadata from multiple input files (dict, FAI,
4//! NCBI assembly reports, VCF headers, etc.) and produces a `KnownReference` with
5//! all contigs and aliases properly merged.
6
7use std::collections::{HashMap, HashSet};
8use std::path::Path;
9use thiserror::Error;
10
11use crate::core::assembly::{ContigMergeError, FastaContig, FastaDistribution};
12use crate::core::contig::{Contig, SequenceRole};
13use crate::core::reference::KnownReference;
14use crate::core::types::{Assembly, ReferenceSource};
15
16#[derive(Error, Debug)]
17pub enum BuilderError {
18    #[error("IO error: {0}")]
19    Io(#[from] std::io::Error),
20
21    #[error("Parse error: {0}")]
22    Parse(String),
23
24    #[error("Conflict: {0}")]
25    Conflict(String),
26
27    #[error("Missing required field: {0}")]
28    MissingField(String),
29
30    #[error("Merge error: {0}")]
31    Merge(#[from] ContigMergeError),
32}
33
34/// Input format for auto-detection
35#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub enum InputFormat {
37    Dict,
38    Fai,
39    Fasta,
40    NcbiReport,
41    Sam,
42    Bam,
43    Cram,
44    Vcf,
45    Tsv,
46}
47
48impl InputFormat {
49    /// Detect format from file extension
50    #[must_use]
51    pub fn from_path(path: &Path) -> Option<Self> {
52        let name = path.file_name()?.to_str()?;
53        let name_lower = name.to_lowercase();
54
55        // Check for NCBI assembly report pattern first
56        // name_lower is already lowercase, so case-sensitive check is fine
57        #[allow(clippy::case_sensitive_file_extension_comparisons)]
58        if name_lower.contains("_assembly_report") && name_lower.ends_with(".txt") {
59            return Some(Self::NcbiReport);
60        }
61
62        // Check for gzipped FASTA files (.fa.gz, .fasta.gz, .fna.gz)
63        if name_lower.ends_with(".fa.gz")
64            || name_lower.ends_with(".fasta.gz")
65            || name_lower.ends_with(".fna.gz")
66            || name_lower.ends_with(".fa.bgz")
67            || name_lower.ends_with(".fasta.bgz")
68            || name_lower.ends_with(".fna.bgz")
69        {
70            return Some(Self::Fasta);
71        }
72
73        let ext = path.extension()?.to_str()?.to_lowercase();
74        match ext.as_str() {
75            "dict" => Some(Self::Dict),
76            "fai" => Some(Self::Fai),
77            "fa" | "fasta" | "fna" => Some(Self::Fasta),
78            "sam" => Some(Self::Sam),
79            "bam" => Some(Self::Bam),
80            "cram" => Some(Self::Cram),
81            "vcf" => Some(Self::Vcf),
82            "tsv" | "txt" => Some(Self::Tsv),
83            "gz" => {
84                // Check for .vcf.gz (stem is lowercased)
85                let stem = path.file_stem()?.to_str()?.to_lowercase();
86                #[allow(clippy::case_sensitive_file_extension_comparisons)]
87                if stem.ends_with(".vcf") {
88                    Some(Self::Vcf)
89                } else {
90                    None
91                }
92            }
93            _ => None,
94        }
95    }
96}
97
98/// Metadata for a single contig, collected from multiple sources
99#[derive(Debug, Clone)]
100pub struct ContigMetadata {
101    /// Primary name (exact, from first source)
102    pub primary_name: String,
103
104    /// Length (must be consistent across sources)
105    pub length: Option<u64>,
106
107    /// MD5 checksum
108    pub md5: Option<String>,
109
110    /// Explicit aliases (from AN tag or NCBI report)
111    pub aliases: HashSet<String>,
112
113    /// Assembly tag (AS)
114    pub assembly: Option<String>,
115
116    /// URI (UR)
117    pub uri: Option<String>,
118
119    /// Species (SP)
120    pub species: Option<String>,
121
122    /// Sequence role from NCBI assembly report
123    pub sequence_role: SequenceRole,
124
125    /// Sources that contributed to this contig
126    pub sources: Vec<String>,
127}
128
129impl ContigMetadata {
130    fn new(name: String) -> Self {
131        Self {
132            primary_name: name,
133            length: None,
134            md5: None,
135            aliases: HashSet::new(),
136            assembly: None,
137            uri: None,
138            species: None,
139            sequence_role: SequenceRole::Unknown,
140            sources: Vec::new(),
141        }
142    }
143
144    /// Convert to a Contig
145    fn to_contig(&self) -> Option<Contig> {
146        let length = self.length?;
147        let mut contig = Contig::new(&self.primary_name, length);
148        contig.md5.clone_from(&self.md5);
149        contig.assembly.clone_from(&self.assembly);
150        contig.uri.clone_from(&self.uri);
151        contig.species.clone_from(&self.species);
152        contig.aliases = self.aliases.iter().cloned().collect();
153        contig.sequence_role = self.sequence_role;
154        Some(contig)
155    }
156}
157
158/// Record of a processed input file
159#[derive(Debug, Clone)]
160pub struct InputRecord {
161    pub path: String,
162    pub format: InputFormat,
163    pub contigs_found: usize,
164    pub contigs_merged: usize,
165    pub aliases_added: usize,
166}
167
168/// Builder that collates metadata from multiple input sources
169pub struct ReferenceBuilder {
170    id: String,
171    display_name: String,
172    assembly: Option<Assembly>,
173    source: Option<ReferenceSource>,
174    description: Option<String>,
175    download_url: Option<String>,
176    assembly_report_url: Option<String>,
177    tags: Vec<String>,
178
179    /// Contigs keyed by EXACT primary name
180    contigs: HashMap<String, ContigMetadata>,
181
182    /// Preserve original contig order (first seen)
183    contig_order: Vec<String>,
184
185    /// Reverse lookup: alias -> primary name
186    alias_to_primary: HashMap<String, String>,
187
188    /// Records of processed inputs
189    inputs_processed: Vec<InputRecord>,
190
191    /// Conflicts detected
192    conflicts: Vec<String>,
193
194    /// Warnings
195    warnings: Vec<String>,
196
197    /// Whether to generate UCSC-style names for patches when parsing NCBI assembly reports.
198    ///
199    /// When `true` (default), for fix-patches and novel-patches that have "na" in the
200    /// UCSC-style-name column, a UCSC-style name will be generated using the convention:
201    /// `chr{chromosome}_{accession}v{version}_{suffix}` where suffix is `_fix` or `_alt`.
202    ///
203    /// This is useful for matching queries that use UCSC naming (like hg38.p12) against
204    /// NCBI assembly reports that don't include UCSC names for patches.
205    ///
206    /// Set to `false` to disable this behavior and only use names explicitly present
207    /// in the assembly report.
208    ///
209    /// See module documentation for [`crate::parsing::ncbi_report`] for details on the
210    /// naming convention and verification sources.
211    generate_ucsc_names: bool,
212}
213
214impl ReferenceBuilder {
215    /// Create a new builder with required fields
216    pub fn new(id: impl Into<String>, display_name: impl Into<String>) -> Self {
217        Self {
218            id: id.into(),
219            display_name: display_name.into(),
220            assembly: None,
221            source: None,
222            description: None,
223            download_url: None,
224            assembly_report_url: None,
225            tags: Vec::new(),
226            contigs: HashMap::new(),
227            contig_order: Vec::new(),
228            alias_to_primary: HashMap::new(),
229            inputs_processed: Vec::new(),
230            conflicts: Vec::new(),
231            warnings: Vec::new(),
232            generate_ucsc_names: true, // Default: generate UCSC names for patches
233        }
234    }
235
236    /// Configure whether to generate UCSC-style names for patches.
237    ///
238    /// When `true` (default), for fix-patches and novel-patches that have "na" in the
239    /// UCSC-style-name column of NCBI assembly reports, a UCSC-style name will be
240    /// generated using the convention: `chr{chromosome}_{accession}v{version}_{suffix}`
241    /// where suffix is `_fix` for fix-patches or `_alt` for novel-patches.
242    ///
243    /// Set to `false` to disable this behavior (opt-out) and only use names explicitly
244    /// present in the assembly report.
245    ///
246    /// # Example
247    ///
248    /// ```
249    /// use ref_solver::catalog::builder::ReferenceBuilder;
250    ///
251    /// // Opt-out of UCSC name generation
252    /// let builder = ReferenceBuilder::new("my_ref", "My Reference")
253    ///     .generate_ucsc_names(false);
254    /// ```
255    #[must_use]
256    pub fn generate_ucsc_names(mut self, generate: bool) -> Self {
257        self.generate_ucsc_names = generate;
258        self
259    }
260
261    #[must_use]
262    pub fn assembly(mut self, assembly: Assembly) -> Self {
263        self.assembly = Some(assembly);
264        self
265    }
266
267    #[must_use]
268    pub fn source(mut self, source: ReferenceSource) -> Self {
269        self.source = Some(source);
270        self
271    }
272
273    #[must_use]
274    pub fn description(mut self, description: impl Into<String>) -> Self {
275        self.description = Some(description.into());
276        self
277    }
278
279    #[must_use]
280    pub fn download_url(mut self, url: impl Into<String>) -> Self {
281        self.download_url = Some(url.into());
282        self
283    }
284
285    #[must_use]
286    pub fn assembly_report_url(mut self, url: impl Into<String>) -> Self {
287        self.assembly_report_url = Some(url.into());
288        self
289    }
290
291    #[must_use]
292    pub fn tags(mut self, tags: Vec<String>) -> Self {
293        self.tags = tags;
294        self
295    }
296
297    /// Add an input file (auto-detect format)
298    ///
299    /// # Errors
300    ///
301    /// Returns `BuilderError::Parse` if format cannot be detected, or other
302    /// errors from parsing the specific format.
303    pub fn add_input(&mut self, path: &Path) -> Result<(), BuilderError> {
304        let format = InputFormat::from_path(path).ok_or_else(|| {
305            BuilderError::Parse(format!("Cannot detect format for: {}", path.display()))
306        })?;
307        self.add_input_with_format(path, format)
308    }
309
310    /// Add an input file with explicit format
311    ///
312    /// # Errors
313    ///
314    /// Returns `BuilderError::Io` if the file cannot be read, `BuilderError::Parse`
315    /// if parsing fails, or `BuilderError::Conflict` if contig data conflicts.
316    pub fn add_input_with_format(
317        &mut self,
318        path: &Path,
319        format: InputFormat,
320    ) -> Result<(), BuilderError> {
321        let path_str = path.display().to_string();
322
323        match format {
324            InputFormat::Dict | InputFormat::Sam => {
325                self.add_dict_or_sam(path, &path_str, format)?;
326            }
327            InputFormat::Bam => {
328                self.add_bam(path, &path_str)?;
329            }
330            InputFormat::Cram => {
331                self.add_cram(path, &path_str)?;
332            }
333            InputFormat::Fai => {
334                self.add_fai(path, &path_str)?;
335            }
336            InputFormat::Fasta => {
337                self.add_fasta(path, &path_str)?;
338            }
339            InputFormat::NcbiReport => {
340                self.add_ncbi_report(path, &path_str)?;
341            }
342            InputFormat::Vcf => {
343                self.add_vcf(path, &path_str)?;
344            }
345            InputFormat::Tsv => {
346                self.add_tsv(path, &path_str)?;
347            }
348        }
349
350        Ok(())
351    }
352
353    fn add_dict_or_sam(
354        &mut self,
355        path: &Path,
356        path_str: &str,
357        format: InputFormat,
358    ) -> Result<(), BuilderError> {
359        let content = std::fs::read_to_string(path)?;
360        let query = crate::parsing::sam::parse_header_text(&content)
361            .map_err(|e| BuilderError::Parse(e.to_string()))?;
362
363        let mut record = InputRecord {
364            path: path_str.to_string(),
365            format,
366            contigs_found: query.contigs.len(),
367            contigs_merged: 0,
368            aliases_added: 0,
369        };
370
371        for contig in query.contigs {
372            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
373            if merged {
374                record.contigs_merged += 1;
375            }
376            record.aliases_added += aliases;
377        }
378
379        self.inputs_processed.push(record);
380        Ok(())
381    }
382
383    fn add_bam(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
384        let query = crate::parsing::sam::parse_file(path)
385            .map_err(|e| BuilderError::Parse(e.to_string()))?;
386
387        let mut record = InputRecord {
388            path: path_str.to_string(),
389            format: InputFormat::Bam,
390            contigs_found: query.contigs.len(),
391            contigs_merged: 0,
392            aliases_added: 0,
393        };
394
395        for contig in query.contigs {
396            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
397            if merged {
398                record.contigs_merged += 1;
399            }
400            record.aliases_added += aliases;
401        }
402
403        self.inputs_processed.push(record);
404        Ok(())
405    }
406
407    fn add_cram(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
408        let query = crate::parsing::sam::parse_file(path)
409            .map_err(|e| BuilderError::Parse(e.to_string()))?;
410
411        let mut record = InputRecord {
412            path: path_str.to_string(),
413            format: InputFormat::Cram,
414            contigs_found: query.contigs.len(),
415            contigs_merged: 0,
416            aliases_added: 0,
417        };
418
419        for contig in query.contigs {
420            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
421            if merged {
422                record.contigs_merged += 1;
423            }
424            record.aliases_added += aliases;
425        }
426
427        self.inputs_processed.push(record);
428        Ok(())
429    }
430
431    fn add_fai(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
432        let content = std::fs::read_to_string(path)?;
433        let query = crate::parsing::fai::parse_fai_text(&content)
434            .map_err(|e| BuilderError::Parse(e.to_string()))?;
435
436        let mut record = InputRecord {
437            path: path_str.to_string(),
438            format: InputFormat::Fai,
439            contigs_found: query.contigs.len(),
440            contigs_merged: 0,
441            aliases_added: 0,
442        };
443
444        for contig in query.contigs {
445            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
446            if merged {
447                record.contigs_merged += 1;
448            }
449            record.aliases_added += aliases;
450        }
451
452        self.inputs_processed.push(record);
453        Ok(())
454    }
455
456    fn add_ncbi_report(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
457        let content = std::fs::read_to_string(path)?;
458        let entries = crate::parsing::ncbi_report::parse_ncbi_report_text(&content)
459            .map_err(|e| BuilderError::Parse(e.to_string()))?;
460
461        let mut record = InputRecord {
462            path: path_str.to_string(),
463            format: InputFormat::NcbiReport,
464            contigs_found: entries.len(),
465            contigs_merged: 0,
466            aliases_added: 0,
467        };
468
469        for entry in entries {
470            // Use generate_ucsc_names option to control UCSC name generation for patches
471            let contig = entry.to_contig_with_options(self.generate_ucsc_names);
472            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
473            if merged {
474                record.contigs_merged += 1;
475            }
476            record.aliases_added += aliases;
477        }
478
479        self.inputs_processed.push(record);
480        Ok(())
481    }
482
483    fn add_vcf(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
484        let content = std::fs::read_to_string(path)?;
485        let query = crate::parsing::vcf::parse_vcf_header_text(&content)
486            .map_err(|e| BuilderError::Parse(e.to_string()))?;
487
488        let mut record = InputRecord {
489            path: path_str.to_string(),
490            format: InputFormat::Vcf,
491            contigs_found: query.contigs.len(),
492            contigs_merged: 0,
493            aliases_added: 0,
494        };
495
496        for contig in query.contigs {
497            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
498            if merged {
499                record.contigs_merged += 1;
500            }
501            record.aliases_added += aliases;
502        }
503
504        self.inputs_processed.push(record);
505        Ok(())
506    }
507
508    fn add_tsv(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
509        let content = std::fs::read_to_string(path)?;
510        let query = crate::parsing::tsv::parse_tsv_text(&content, '\t')
511            .map_err(|e| BuilderError::Parse(e.to_string()))?;
512
513        let mut record = InputRecord {
514            path: path_str.to_string(),
515            format: InputFormat::Tsv,
516            contigs_found: query.contigs.len(),
517            contigs_merged: 0,
518            aliases_added: 0,
519        };
520
521        for contig in query.contigs {
522            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
523            if merged {
524                record.contigs_merged += 1;
525            }
526            record.aliases_added += aliases;
527        }
528
529        self.inputs_processed.push(record);
530        Ok(())
531    }
532
533    fn add_fasta(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
534        let query = crate::parsing::fasta::parse_fasta_file_with_md5(path)
535            .map_err(|e| BuilderError::Parse(e.to_string()))?;
536
537        let mut record = InputRecord {
538            path: path_str.to_string(),
539            format: InputFormat::Fasta,
540            contigs_found: query.contigs.len(),
541            contigs_merged: 0,
542            aliases_added: 0,
543        };
544
545        for contig in query.contigs {
546            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
547            if merged {
548                record.contigs_merged += 1;
549            }
550            record.aliases_added += aliases;
551        }
552
553        self.inputs_processed.push(record);
554        Ok(())
555    }
556
557    /// Merge a contig into the builder.
558    /// Returns (`was_merged_into_existing`, `num_aliases_added`)
559    fn merge_contig(
560        &mut self,
561        contig: &Contig,
562        source: &str,
563    ) -> Result<(bool, usize), BuilderError> {
564        // Find existing entry by exact name or alias
565        let mut existing_primary = self.find_existing_primary(&contig.name);
566
567        // Also check if any of the incoming contig's aliases match existing entries
568        if existing_primary.is_none() {
569            for alias in &contig.aliases {
570                if let Some(primary) = self.find_existing_primary(alias) {
571                    existing_primary = Some(primary);
572                    break;
573                }
574            }
575        }
576
577        if let Some(primary) = existing_primary {
578            // Merge into existing
579            let metadata = self.contigs.get_mut(&primary).unwrap();
580            let aliases_before = metadata.aliases.len();
581
582            // Check length conflict
583            if let Some(existing_len) = metadata.length {
584                if existing_len != contig.length {
585                    let msg = format!(
586                        "Length conflict for '{}': {} vs {} (from {})",
587                        contig.name, existing_len, contig.length, source
588                    );
589                    self.conflicts.push(msg.clone());
590                    return Err(BuilderError::Conflict(msg));
591                }
592            } else {
593                metadata.length = Some(contig.length);
594            }
595
596            // Check MD5 conflict
597            if let (Some(existing_md5), Some(new_md5)) = (&metadata.md5, &contig.md5) {
598                if existing_md5 != new_md5 {
599                    let msg = format!(
600                        "MD5 conflict for '{}': {} vs {} (from {})",
601                        contig.name, existing_md5, new_md5, source
602                    );
603                    self.conflicts.push(msg.clone());
604                    return Err(BuilderError::Conflict(msg));
605                }
606            } else if metadata.md5.is_none() && contig.md5.is_some() {
607                metadata.md5.clone_from(&contig.md5);
608            }
609
610            // Merge aliases
611            for alias in &contig.aliases {
612                if !metadata.aliases.contains(alias) && alias != &metadata.primary_name {
613                    // Check alias doesn't conflict with another contig
614                    if let Some(other_primary) = self.alias_to_primary.get(alias) {
615                        if other_primary != &primary {
616                            self.warnings.push(format!(
617                                "Alias '{alias}' already mapped to '{other_primary}', skipping for '{primary}'"
618                            ));
619                            continue;
620                        }
621                    }
622                    metadata.aliases.insert(alias.clone());
623                    self.alias_to_primary.insert(alias.clone(), primary.clone());
624                }
625            }
626
627            // Also add the contig's name as an alias if different from primary
628            if contig.name != primary && !metadata.aliases.contains(&contig.name) {
629                metadata.aliases.insert(contig.name.clone());
630                self.alias_to_primary
631                    .insert(contig.name.clone(), primary.clone());
632            }
633
634            // Fill other fields
635            if metadata.assembly.is_none() && contig.assembly.is_some() {
636                metadata.assembly.clone_from(&contig.assembly);
637            }
638            if metadata.uri.is_none() && contig.uri.is_some() {
639                metadata.uri.clone_from(&contig.uri);
640            }
641            if metadata.species.is_none() && contig.species.is_some() {
642                metadata.species.clone_from(&contig.species);
643            }
644            // Update sequence role if not already set
645            if matches!(metadata.sequence_role, SequenceRole::Unknown) {
646                metadata.sequence_role = contig.sequence_role;
647            }
648
649            metadata.sources.push(source.to_string());
650
651            let aliases_added = metadata.aliases.len() - aliases_before;
652            Ok((true, aliases_added))
653        } else {
654            // Create new entry
655            let mut metadata = ContigMetadata::new(contig.name.clone());
656            metadata.length = Some(contig.length);
657            metadata.md5.clone_from(&contig.md5);
658            metadata.assembly.clone_from(&contig.assembly);
659            metadata.uri.clone_from(&contig.uri);
660            metadata.species.clone_from(&contig.species);
661            metadata.sequence_role = contig.sequence_role;
662            metadata.sources.push(source.to_string());
663
664            // Add aliases
665            let mut aliases_added = 0;
666            for alias in &contig.aliases {
667                if alias != &contig.name {
668                    if let Some(other_primary) = self.alias_to_primary.get(alias) {
669                        self.warnings.push(format!(
670                            "Alias '{}' already mapped to '{}', skipping for '{}'",
671                            alias, other_primary, contig.name
672                        ));
673                        continue;
674                    }
675                    metadata.aliases.insert(alias.clone());
676                    self.alias_to_primary
677                        .insert(alias.clone(), contig.name.clone());
678                    aliases_added += 1;
679                }
680            }
681
682            self.contig_order.push(contig.name.clone());
683            self.contigs.insert(contig.name.clone(), metadata);
684
685            Ok((false, aliases_added))
686        }
687    }
688
689    /// Find the primary name for a contig name (checks exact match and aliases)
690    fn find_existing_primary(&self, name: &str) -> Option<String> {
691        // Check exact name match
692        if self.contigs.contains_key(name) {
693            return Some(name.to_string());
694        }
695
696        // Check alias match
697        if let Some(primary) = self.alias_to_primary.get(name) {
698            return Some(primary.clone());
699        }
700
701        None
702    }
703
704    /// Build the final `KnownReference`
705    ///
706    /// # Errors
707    ///
708    /// Returns `BuilderError::MissingField` if no contigs were added or required
709    /// fields are missing.
710    pub fn build(self) -> Result<KnownReference, BuilderError> {
711        // Validate
712        if self.contigs.is_empty() {
713            return Err(BuilderError::MissingField("No contigs added".to_string()));
714        }
715
716        // Check for missing lengths
717        let mut missing_length = Vec::new();
718        for name in &self.contig_order {
719            if let Some(metadata) = self.contigs.get(name) {
720                if metadata.length.is_none() {
721                    missing_length.push(name.clone());
722                }
723            }
724        }
725        if !missing_length.is_empty() {
726            return Err(BuilderError::MissingField(format!(
727                "Missing length for contigs: {missing_length:?}"
728            )));
729        }
730
731        // Build contigs in order
732        let contigs: Vec<Contig> = self
733            .contig_order
734            .iter()
735            .filter_map(|name| self.contigs.get(name))
736            .filter_map(ContigMetadata::to_contig)
737            .collect();
738
739        // Find contigs that are in assembly report but missing MD5 (i.e., not in FASTA)
740        // These are typically assembled-molecule contigs that use external references (like MT in CHM13)
741        let mut contigs_missing_from_fasta: Vec<String> = Vec::new();
742        for (name, meta) in &self.contigs {
743            if meta.md5.is_none() && matches!(meta.sequence_role, SequenceRole::AssembledMolecule) {
744                contigs_missing_from_fasta.push(name.clone());
745            }
746        }
747        contigs_missing_from_fasta.sort();
748
749        // Determine assembly if not set
750        let assembly = self
751            .assembly
752            .unwrap_or_else(|| detect_assembly_from_name(&self.display_name));
753
754        // Determine source if not set
755        let source = self
756            .source
757            .unwrap_or(ReferenceSource::Custom("Unknown".to_string()));
758
759        // Determine naming convention
760        let naming_convention = crate::core::contig::detect_naming_convention(&contigs);
761
762        let mut reference = KnownReference {
763            id: crate::core::types::ReferenceId::new(&self.id),
764            display_name: self.display_name,
765            assembly,
766            source,
767            naming_convention,
768            download_url: self.download_url,
769            assembly_report_url: self.assembly_report_url,
770            contigs,
771            description: self.description,
772            tags: self.tags,
773            contigs_missing_from_fasta,
774            md5_set: HashSet::new(),
775            name_length_set: HashSet::new(),
776            signature: None,
777        };
778
779        reference.rebuild_indexes();
780        Ok(reference)
781    }
782
783    /// Get summary of build
784    #[must_use]
785    pub fn summary(&self) -> BuildSummary {
786        let total_contigs = self.contigs.len();
787        let with_length = self.contigs.values().filter(|m| m.length.is_some()).count();
788        let with_md5 = self.contigs.values().filter(|m| m.md5.is_some()).count();
789        let with_aliases = self
790            .contigs
791            .values()
792            .filter(|m| !m.aliases.is_empty())
793            .count();
794
795        // Collect assembled-molecule contigs missing MD5 (these are important)
796        let mut missing_md5_assembled: Vec<String> = Vec::new();
797        for (name, meta) in &self.contigs {
798            if meta.md5.is_none() {
799                // Check if it's an assembled-molecule (primary chromosome)
800                if matches!(meta.sequence_role, SequenceRole::AssembledMolecule) {
801                    missing_md5_assembled.push(name.clone());
802                }
803            }
804        }
805
806        // Count primary chromosomes
807        let primary_count = self
808            .contigs
809            .keys()
810            .filter(|name| {
811                let contig = Contig::new(name.as_str(), 0);
812                contig.is_primary_chromosome()
813            })
814            .count();
815
816        // Count ALT contigs
817        let alt_count = self
818            .contigs
819            .keys()
820            .filter(|name| name.ends_with("_alt") || name.contains("_alt_"))
821            .count();
822
823        BuildSummary {
824            id: self.id.clone(),
825            display_name: self.display_name.clone(),
826            assembly: self.assembly.clone(),
827            source: self.source.clone(),
828            inputs: self.inputs_processed.clone(),
829            total_contigs,
830            primary_chromosomes: primary_count,
831            alt_contigs: alt_count,
832            other_contigs: total_contigs.saturating_sub(primary_count + alt_count),
833            with_length,
834            with_md5,
835            with_aliases,
836            missing_md5_assembled,
837            conflicts: self.conflicts.clone(),
838            warnings: self.warnings.clone(),
839        }
840    }
841}
842
843/// Summary of the build process
844#[derive(Debug, Clone)]
845pub struct BuildSummary {
846    pub id: String,
847    pub display_name: String,
848    pub assembly: Option<Assembly>,
849    pub source: Option<ReferenceSource>,
850    pub inputs: Vec<InputRecord>,
851    pub total_contigs: usize,
852    pub primary_chromosomes: usize,
853    pub alt_contigs: usize,
854    pub other_contigs: usize,
855    pub with_length: usize,
856    pub with_md5: usize,
857    pub with_aliases: usize,
858    pub missing_md5_assembled: Vec<String>,
859    pub conflicts: Vec<String>,
860    pub warnings: Vec<String>,
861}
862
863impl std::fmt::Display for BuildSummary {
864    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
865        writeln!(f, "Reference Builder Summary")?;
866        writeln!(f, "=========================")?;
867        writeln!(f, "ID:       {}", self.id)?;
868        writeln!(f, "Name:     {}", self.display_name)?;
869        if let Some(ref assembly) = self.assembly {
870            writeln!(f, "Assembly: {assembly:?}")?;
871        }
872        if let Some(ref source) = self.source {
873            writeln!(f, "Source:   {source:?}")?;
874        }
875        writeln!(f)?;
876
877        writeln!(f, "Inputs:")?;
878        for (i, input) in self.inputs.iter().enumerate() {
879            writeln!(
880                f,
881                "  [{}] {} ({:?}) -> {} contigs, {} merged, {} aliases",
882                i + 1,
883                input.path,
884                input.format,
885                input.contigs_found,
886                input.contigs_merged,
887                input.aliases_added
888            )?;
889        }
890        writeln!(f)?;
891
892        writeln!(f, "Contigs: {} total", self.total_contigs)?;
893        writeln!(f, "  - Primary chromosomes: {}", self.primary_chromosomes)?;
894        writeln!(f, "  - ALT contigs: {}", self.alt_contigs)?;
895        writeln!(f, "  - Other: {}", self.other_contigs)?;
896        writeln!(f)?;
897
898        writeln!(f, "Coverage:")?;
899        let pct = |n: usize, total: usize| {
900            if total == 0 {
901                0
902            } else {
903                (n * 100) / total
904            }
905        };
906        let check = |n: usize, total: usize| {
907            if n == total {
908                "+"
909            } else {
910                "o"
911            }
912        };
913        writeln!(
914            f,
915            "  {} Length:  {}/{} ({}%)",
916            check(self.with_length, self.total_contigs),
917            self.with_length,
918            self.total_contigs,
919            pct(self.with_length, self.total_contigs)
920        )?;
921        writeln!(
922            f,
923            "  {} MD5:     {}/{} ({}%)",
924            check(self.with_md5, self.total_contigs),
925            self.with_md5,
926            self.total_contigs,
927            pct(self.with_md5, self.total_contigs)
928        )?;
929        writeln!(
930            f,
931            "  {} Aliases: {}/{} ({}%)",
932            check(self.with_aliases, self.total_contigs),
933            self.with_aliases,
934            self.total_contigs,
935            pct(self.with_aliases, self.total_contigs)
936        )?;
937        writeln!(f)?;
938
939        writeln!(f, "Conflicts: {}", self.conflicts.len())?;
940        for conflict in &self.conflicts {
941            writeln!(f, "  - {conflict}")?;
942        }
943
944        let total_warnings = self.warnings.len() + self.missing_md5_assembled.len();
945        writeln!(f, "Warnings: {total_warnings}")?;
946        for warning in &self.warnings {
947            writeln!(f, "  - {warning}")?;
948        }
949        if !self.missing_md5_assembled.is_empty() {
950            writeln!(
951                f,
952                "  - Missing MD5 for assembled-molecule contigs: {}",
953                self.missing_md5_assembled.join(", ")
954            )?;
955        }
956
957        Ok(())
958    }
959}
960
961/// Builder for creating `FastaDistribution` from multiple input files
962///
963/// This builder merges contig metadata from multiple sources (dict, VCF, SAM/BAM, etc.)
964/// keyed by (name, length). It handles:
965/// - MD5 merging (first non-empty wins, error on conflict)
966/// - Alias merging (union of all)
967/// - Sort order preservation (first seen order)
968#[derive(Debug)]
969pub struct DistributionBuilder {
970    id: String,
971    display_name: String,
972    source: ReferenceSource,
973    download_url: Option<String>,
974    tags: Vec<String>,
975
976    /// Contigs keyed by (name, length) for merging
977    contigs: HashMap<(String, u64), FastaContig>,
978
979    /// Track insertion order
980    insertion_order: Vec<(String, u64)>,
981
982    /// Source files processed
983    source_files: Vec<String>,
984
985    /// Whether to generate UCSC-style names for patches (see [`ReferenceBuilder`])
986    generate_ucsc_names: bool,
987}
988
989impl Default for DistributionBuilder {
990    fn default() -> Self {
991        Self::new("")
992    }
993}
994
995impl DistributionBuilder {
996    /// Create a new builder with the given distribution ID
997    pub fn new(id: impl Into<String>) -> Self {
998        Self {
999            id: id.into(),
1000            display_name: String::new(),
1001            source: ReferenceSource::Custom("custom".to_string()),
1002            download_url: None,
1003            tags: Vec::new(),
1004            contigs: HashMap::new(),
1005            insertion_order: Vec::new(),
1006            source_files: Vec::new(),
1007            generate_ucsc_names: true, // Default: generate UCSC names for patches
1008        }
1009    }
1010
1011    /// Configure whether to generate UCSC-style names for patches.
1012    ///
1013    /// See [`ReferenceBuilder::generate_ucsc_names`] for details.
1014    #[must_use]
1015    pub fn with_generate_ucsc_names(mut self, generate: bool) -> Self {
1016        self.generate_ucsc_names = generate;
1017        self
1018    }
1019
1020    /// Set the display name
1021    #[must_use]
1022    pub fn with_display_name(mut self, name: impl Into<String>) -> Self {
1023        self.display_name = name.into();
1024        self
1025    }
1026
1027    /// Set the reference source
1028    #[must_use]
1029    pub fn with_source(mut self, source: ReferenceSource) -> Self {
1030        self.source = source;
1031        self
1032    }
1033
1034    /// Set the download URL
1035    #[must_use]
1036    pub fn with_download_url(mut self, url: impl Into<String>) -> Self {
1037        self.download_url = Some(url.into());
1038        self
1039    }
1040
1041    /// Set tags
1042    #[must_use]
1043    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
1044        self.tags = tags;
1045        self
1046    }
1047
1048    /// Add an input file (auto-detect format)
1049    ///
1050    /// # Errors
1051    ///
1052    /// Returns `BuilderError::Parse` if format cannot be detected, or other
1053    /// errors from parsing the specific format.
1054    pub fn add_input(&mut self, path: &Path) -> Result<&mut Self, BuilderError> {
1055        let format = InputFormat::from_path(path).ok_or_else(|| {
1056            BuilderError::Parse(format!("Cannot detect format for: {}", path.display()))
1057        })?;
1058        self.add_input_with_format(path, format)
1059    }
1060
1061    /// Add an input file with explicit format
1062    ///
1063    /// # Errors
1064    ///
1065    /// Returns `BuilderError::Io` if the file cannot be read, `BuilderError::Parse`
1066    /// if parsing fails, or `BuilderError::Conflict` if contig data conflicts.
1067    pub fn add_input_with_format(
1068        &mut self,
1069        path: &Path,
1070        format: InputFormat,
1071    ) -> Result<&mut Self, BuilderError> {
1072        let path_str = path.display().to_string();
1073        self.source_files.push(path_str.clone());
1074
1075        let contigs = self.parse_input(path, format)?;
1076
1077        for contig in contigs {
1078            let key = (contig.name.clone(), contig.length);
1079            #[allow(clippy::cast_possible_truncation)] // Contig count limited by MAX_CONTIGS (50k)
1080            let fasta_contig = FastaContig {
1081                name: contig.name,
1082                length: contig.length,
1083                md5: contig.md5.unwrap_or_default(),
1084                sort_order: self.insertion_order.len() as u32,
1085                report_contig_id: None,
1086                aliases: contig.aliases,
1087            };
1088
1089            if let Some(existing) = self.contigs.get_mut(&key) {
1090                existing.merge(&fasta_contig)?;
1091            } else {
1092                self.insertion_order.push(key.clone());
1093                self.contigs.insert(key, fasta_contig);
1094            }
1095        }
1096
1097        Ok(self)
1098    }
1099
1100    /// Parse contigs from an input file
1101    fn parse_input(&self, path: &Path, format: InputFormat) -> Result<Vec<Contig>, BuilderError> {
1102        match format {
1103            InputFormat::Dict | InputFormat::Sam => {
1104                let content = std::fs::read_to_string(path)?;
1105                let query = crate::parsing::sam::parse_header_text(&content)
1106                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1107                Ok(query.contigs)
1108            }
1109            InputFormat::Bam | InputFormat::Cram => {
1110                let query = crate::parsing::sam::parse_file(path)
1111                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1112                Ok(query.contigs)
1113            }
1114            InputFormat::Fai => {
1115                let content = std::fs::read_to_string(path)?;
1116                let query = crate::parsing::fai::parse_fai_text(&content)
1117                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1118                Ok(query.contigs)
1119            }
1120            InputFormat::Fasta => {
1121                // Parse FASTA with MD5 computation
1122                let query = crate::parsing::fasta::parse_fasta_file_with_md5(path)
1123                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1124                Ok(query.contigs)
1125            }
1126            InputFormat::NcbiReport => {
1127                let content = std::fs::read_to_string(path)?;
1128                let entries = crate::parsing::ncbi_report::parse_ncbi_report_text(&content)
1129                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1130                // Use generate_ucsc_names option to control UCSC name generation for patches
1131                Ok(entries
1132                    .into_iter()
1133                    .map(|e| e.to_contig_with_options(self.generate_ucsc_names))
1134                    .collect())
1135            }
1136            InputFormat::Vcf => {
1137                let content = std::fs::read_to_string(path)?;
1138                let query = crate::parsing::vcf::parse_vcf_header_text(&content)
1139                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1140                Ok(query.contigs)
1141            }
1142            InputFormat::Tsv => {
1143                let content = std::fs::read_to_string(path)?;
1144                let query = crate::parsing::tsv::parse_tsv_text(&content, '\t')
1145                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1146                Ok(query.contigs)
1147            }
1148        }
1149    }
1150
1151    /// Build the `FastaDistribution`
1152    ///
1153    /// # Errors
1154    ///
1155    /// Returns `BuilderError::MissingField` if no contigs were found.
1156    pub fn build(self) -> Result<FastaDistribution, BuilderError> {
1157        if self.contigs.is_empty() {
1158            return Err(BuilderError::MissingField("No contigs found".to_string()));
1159        }
1160
1161        // Build contigs in insertion order
1162        let mut contigs: Vec<FastaContig> = Vec::with_capacity(self.insertion_order.len());
1163        for (i, key) in self.insertion_order.iter().enumerate() {
1164            if let Some(mut contig) = self.contigs.get(key).cloned() {
1165                #[allow(clippy::cast_possible_truncation)] // Contig count limited
1166                {
1167                    contig.sort_order = i as u32;
1168                }
1169                contigs.push(contig);
1170            }
1171        }
1172
1173        Ok(FastaDistribution {
1174            id: self.id,
1175            display_name: self.display_name,
1176            source: self.source,
1177            download_url: self.download_url,
1178            tags: self.tags,
1179            contigs,
1180        })
1181    }
1182}
1183
1184/// Detect assembly version from display name
1185#[must_use]
1186pub fn detect_assembly_from_name(display_name: &str) -> Assembly {
1187    let lower = display_name.to_lowercase();
1188    if lower.contains("chm13") || lower.contains("t2t") {
1189        Assembly::Other("CHM13".to_string())
1190    } else if lower.contains("38") || lower.contains("hg38") || lower.contains("hs38") {
1191        Assembly::Grch38
1192    } else if lower.contains("37")
1193        || lower.contains("19")
1194        || lower.contains("hg19")
1195        || lower.contains("hs37")
1196        || lower.contains("b37")
1197    {
1198        Assembly::Grch37
1199    } else {
1200        Assembly::Other(display_name.to_string())
1201    }
1202}
1203
1204#[cfg(test)]
1205mod tests {
1206    use super::*;
1207
1208    #[test]
1209    fn test_input_format_detection() {
1210        assert_eq!(
1211            InputFormat::from_path(Path::new("test.dict")),
1212            Some(InputFormat::Dict)
1213        );
1214        assert_eq!(
1215            InputFormat::from_path(Path::new("test.fai")),
1216            Some(InputFormat::Fai)
1217        );
1218        assert_eq!(
1219            InputFormat::from_path(Path::new("test.fa")),
1220            Some(InputFormat::Fasta)
1221        );
1222        assert_eq!(
1223            InputFormat::from_path(Path::new("test.fasta")),
1224            Some(InputFormat::Fasta)
1225        );
1226        assert_eq!(
1227            InputFormat::from_path(Path::new("test.fna")),
1228            Some(InputFormat::Fasta)
1229        );
1230        assert_eq!(
1231            InputFormat::from_path(Path::new("test.fa.gz")),
1232            Some(InputFormat::Fasta)
1233        );
1234        assert_eq!(
1235            InputFormat::from_path(Path::new("test.vcf")),
1236            Some(InputFormat::Vcf)
1237        );
1238        assert_eq!(
1239            InputFormat::from_path(Path::new("test.vcf.gz")),
1240            Some(InputFormat::Vcf)
1241        );
1242        assert_eq!(
1243            InputFormat::from_path(Path::new("GRCh38_assembly_report.txt")),
1244            Some(InputFormat::NcbiReport)
1245        );
1246    }
1247
1248    #[test]
1249    fn test_detect_assembly() {
1250        assert!(matches!(
1251            detect_assembly_from_name("GRCh38 (Broad)"),
1252            Assembly::Grch38
1253        ));
1254        assert!(matches!(
1255            detect_assembly_from_name("hg38 UCSC"),
1256            Assembly::Grch38
1257        ));
1258        assert!(matches!(
1259            detect_assembly_from_name("GRCh37"),
1260            Assembly::Grch37
1261        ));
1262        assert!(matches!(
1263            detect_assembly_from_name("hg19"),
1264            Assembly::Grch37
1265        ));
1266        assert!(matches!(
1267            detect_assembly_from_name("T2T-CHM13"),
1268            Assembly::Other(_)
1269        ));
1270    }
1271
1272    #[test]
1273    fn test_builder_single_input() {
1274        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1275            .assembly(Assembly::Grch38)
1276            .source(ReferenceSource::Custom("test".to_string()));
1277
1278        // Manually add a contig
1279        let contig = Contig::new("chr1", 248_956_422);
1280        builder.merge_contig(&contig, "test").unwrap();
1281
1282        let reference = builder.build().unwrap();
1283        assert_eq!(reference.id.0, "test_ref");
1284        assert_eq!(reference.contigs.len(), 1);
1285        assert_eq!(reference.contigs[0].name, "chr1");
1286    }
1287
1288    #[test]
1289    fn test_builder_merge_with_aliases() {
1290        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference");
1291
1292        // Add first contig
1293        let contig1 = Contig::new("chr1", 248_956_422);
1294        builder.merge_contig(&contig1, "source1").unwrap();
1295
1296        // Add same contig with different name (as alias)
1297        let mut contig2 = Contig::new("1", 248_956_422);
1298        contig2.aliases = vec!["chr1".to_string()];
1299        builder.merge_contig(&contig2, "source2").unwrap();
1300
1301        // Should have merged into one entry
1302        let summary = builder.summary();
1303        assert_eq!(summary.total_contigs, 1);
1304
1305        let reference = builder.build().unwrap();
1306        assert_eq!(reference.contigs.len(), 1);
1307        // The alias "1" should be in aliases
1308        assert!(reference.contigs[0].aliases.contains(&"1".to_string()));
1309    }
1310
1311    #[test]
1312    fn test_builder_conflict_detection() {
1313        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference");
1314
1315        // Add first contig
1316        let contig1 = Contig::new("chr1", 248_956_422);
1317        builder.merge_contig(&contig1, "source1").unwrap();
1318
1319        // Add conflicting contig (different length, same name)
1320        let contig2 = Contig::new("chr1", 100_000);
1321        let result = builder.merge_contig(&contig2, "source2");
1322        assert!(result.is_err());
1323        assert!(matches!(result.unwrap_err(), BuilderError::Conflict(_)));
1324    }
1325
1326    #[test]
1327    fn test_distribution_builder_single_dict() {
1328        use std::io::Write;
1329        use tempfile::NamedTempFile;
1330
1331        let mut file = NamedTempFile::with_suffix(".dict").unwrap();
1332        writeln!(file, "@HD\tVN:1.6").unwrap();
1333        writeln!(
1334            file,
1335            "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1336        )
1337        .unwrap();
1338        writeln!(
1339            file,
1340            "@SQ\tSN:chr2\tLN:2000\tM5:f98db672eb0993dcfdabafe2a882905c"
1341        )
1342        .unwrap();
1343
1344        let mut builder = DistributionBuilder::new("test_ref");
1345        builder.add_input(file.path()).unwrap();
1346        let dist = builder.build().unwrap();
1347
1348        assert_eq!(dist.contigs.len(), 2);
1349        assert_eq!(dist.contigs[0].name, "chr1");
1350        assert_eq!(dist.contigs[0].md5, "6aef897c3d6ff0c78aff06ac189178dd");
1351        assert_eq!(dist.contigs[1].name, "chr2");
1352        assert_eq!(dist.contigs[1].md5, "f98db672eb0993dcfdabafe2a882905c");
1353    }
1354
1355    #[test]
1356    fn test_distribution_builder_merges_inputs() {
1357        use std::io::Write;
1358        use tempfile::NamedTempFile;
1359
1360        // Dict without MD5
1361        let mut dict = NamedTempFile::with_suffix(".dict").unwrap();
1362        writeln!(dict, "@HD\tVN:1.6").unwrap();
1363        writeln!(dict, "@SQ\tSN:chr1\tLN:1000").unwrap();
1364
1365        // VCF with MD5
1366        let mut vcf = NamedTempFile::with_suffix(".vcf").unwrap();
1367        writeln!(vcf, "##fileformat=VCFv4.2").unwrap();
1368        writeln!(
1369            vcf,
1370            "##contig=<ID=chr1,length=1000,md5=6aef897c3d6ff0c78aff06ac189178dd>"
1371        )
1372        .unwrap();
1373        writeln!(vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO").unwrap();
1374
1375        let mut builder = DistributionBuilder::new("test_ref");
1376        builder.add_input(dict.path()).unwrap();
1377        builder.add_input(vcf.path()).unwrap();
1378        let distribution = builder.build().unwrap();
1379
1380        assert_eq!(distribution.contigs.len(), 1);
1381        assert_eq!(
1382            distribution.contigs[0].md5,
1383            "6aef897c3d6ff0c78aff06ac189178dd"
1384        );
1385    }
1386
1387    #[test]
1388    fn test_distribution_builder_md5_conflict() {
1389        use std::io::Write;
1390        use tempfile::NamedTempFile;
1391
1392        let mut dict = NamedTempFile::with_suffix(".dict").unwrap();
1393        writeln!(dict, "@HD\tVN:1.6").unwrap();
1394        writeln!(
1395            dict,
1396            "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1397        )
1398        .unwrap();
1399
1400        let mut vcf = NamedTempFile::with_suffix(".vcf").unwrap();
1401        writeln!(vcf, "##fileformat=VCFv4.2").unwrap();
1402        writeln!(
1403            vcf,
1404            "##contig=<ID=chr1,length=1000,md5=f98db672eb0993dcfdabafe2a882905c>"
1405        )
1406        .unwrap();
1407        writeln!(vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO").unwrap();
1408
1409        let mut builder = DistributionBuilder::new("test_ref");
1410        builder.add_input(dict.path()).unwrap();
1411        let result = builder.add_input(vcf.path());
1412
1413        assert!(matches!(result, Err(BuilderError::Merge(_))));
1414    }
1415
1416    #[test]
1417    fn test_distribution_builder_preserves_order() {
1418        use std::io::Write;
1419        use tempfile::NamedTempFile;
1420
1421        let mut file = NamedTempFile::with_suffix(".dict").unwrap();
1422        writeln!(file, "@HD\tVN:1.6").unwrap();
1423        writeln!(
1424            file,
1425            "@SQ\tSN:chrM\tLN:16569\tM5:d2ed829b8a1628d16cbeee88e88e39eb"
1426        )
1427        .unwrap();
1428        writeln!(
1429            file,
1430            "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1431        )
1432        .unwrap();
1433        writeln!(
1434            file,
1435            "@SQ\tSN:chr2\tLN:242193529\tM5:f98db672eb0993dcfdabafe2a882905c"
1436        )
1437        .unwrap();
1438
1439        let mut builder = DistributionBuilder::new("test_ref");
1440        builder.add_input(file.path()).unwrap();
1441        let dist = builder.build().unwrap();
1442
1443        assert_eq!(dist.contigs[0].name, "chrM");
1444        assert_eq!(dist.contigs[0].sort_order, 0);
1445        assert_eq!(dist.contigs[1].name, "chr1");
1446        assert_eq!(dist.contigs[1].sort_order, 1);
1447        assert_eq!(dist.contigs[2].name, "chr2");
1448        assert_eq!(dist.contigs[2].sort_order, 2);
1449    }
1450}