Skip to main content

ref_solver/catalog/
builder.rs

1//! Reference builder for creating catalog entries from multiple input sources.
2//!
3//! The `ReferenceBuilder` collates metadata from multiple input files (dict, FAI,
4//! NCBI assembly reports, VCF headers, etc.) and produces a `KnownReference` with
5//! all contigs and aliases properly merged.
6
7use std::collections::{HashMap, HashSet};
8use std::path::Path;
9use thiserror::Error;
10
11use crate::core::assembly::{ContigMergeError, FastaContig, FastaDistribution};
12use crate::core::contig::{Contig, SequenceRole};
13use crate::core::reference::KnownReference;
14use crate::core::types::{Assembly, ReferenceSource};
15use crate::utils::validation::{is_valid_md5, is_valid_sha512t24u};
16
17#[derive(Error, Debug)]
18#[non_exhaustive]
19pub enum BuilderError {
20    #[error("IO error: {0}")]
21    Io(#[from] std::io::Error),
22
23    #[error("Parse error: {0}")]
24    Parse(String),
25
26    #[error("Conflict: {0}")]
27    Conflict(String),
28
29    #[error("Validation error: {0}")]
30    Validation(String),
31
32    #[error("Missing required field: {0}")]
33    MissingField(String),
34
35    #[error("Merge error: {0}")]
36    Merge(#[from] ContigMergeError),
37}
38
/// File formats the builder can ingest, as recognised from a file name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InputFormat {
    /// Sequence dictionary (`.dict`).
    Dict,
    /// FASTA index (`.fai`).
    Fai,
    /// FASTA sequence file, plain or gzip/bgzip-compressed.
    Fasta,
    /// NCBI assembly report (`*_assembly_report*.txt`).
    NcbiReport,
    /// SAM text file (`.sam`).
    Sam,
    /// BAM file (`.bam`).
    Bam,
    /// CRAM file (`.cram`).
    Cram,
    /// VCF file (`.vcf` or `.vcf.gz`).
    Vcf,
    /// Tab-separated table (`.tsv`, or any other `.txt`).
    Tsv,
}

impl InputFormat {
    /// Guess the input format from the file name of `path`.
    ///
    /// Matching is case-insensitive and looks only at the name, never at the
    /// file's contents. Checks run in this order:
    ///
    /// 1. NCBI assembly reports (`*_assembly_report*.txt`),
    /// 2. compressed FASTA (`.fa`/`.fasta`/`.fna` followed by `.gz` or `.bgz`),
    /// 3. the plain extension, where `.gz` is accepted only for `.vcf.gz`.
    ///
    /// Returns `None` when the path has no UTF-8 file name or the name matches
    /// none of the known patterns.
    // Every comparison below is made against an already-lowercased string, so
    // the case-sensitive suffix checks this lint warns about are intentional.
    #[allow(clippy::case_sensitive_file_extension_comparisons)]
    #[must_use]
    pub fn from_path(path: &Path) -> Option<Self> {
        const COMPRESSED_FASTA_SUFFIXES: [&str; 6] = [
            ".fa.gz",
            ".fasta.gz",
            ".fna.gz",
            ".fa.bgz",
            ".fasta.bgz",
            ".fna.bgz",
        ];

        let file_name = path.file_name()?.to_str()?.to_lowercase();

        // Assembly reports end in `.txt`, so they must be recognised before the
        // generic `.txt` -> TSV rule further down.
        if file_name.ends_with(".txt") && file_name.contains("_assembly_report") {
            return Some(Self::NcbiReport);
        }

        // Double extensions cannot be seen through `Path::extension`, so the
        // compressed FASTA variants are matched on the whole name.
        if COMPRESSED_FASTA_SUFFIXES
            .iter()
            .any(|suffix| file_name.ends_with(*suffix))
        {
            return Some(Self::Fasta);
        }

        let extension = path.extension()?.to_str()?.to_lowercase();
        let detected = match extension.as_str() {
            "dict" => Self::Dict,
            "fai" => Self::Fai,
            "fa" | "fasta" | "fna" => Self::Fasta,
            "sam" => Self::Sam,
            "bam" => Self::Bam,
            "cram" => Self::Cram,
            "vcf" => Self::Vcf,
            "tsv" | "txt" => Self::Tsv,
            "gz" => {
                // A bare `.gz` is only meaningful here as `.vcf.gz`.
                let stem = path.file_stem()?.to_str()?.to_lowercase();
                if !stem.ends_with(".vcf") {
                    return None;
                }
                Self::Vcf
            }
            _ => return None,
        };
        Some(detected)
    }
}
102
103/// Metadata for a single contig, collected from multiple sources
104#[derive(Debug, Clone)]
105#[non_exhaustive]
106pub struct ContigMetadata {
107    /// Primary name (exact, from first source)
108    pub primary_name: String,
109
110    /// Length (must be consistent across sources)
111    pub length: Option<u64>,
112
113    /// MD5 checksum
114    pub md5: Option<String>,
115
116    /// GA4GH sha512t24u digest
117    pub sha512t24u: Option<String>,
118
119    /// Explicit aliases (from AN tag or NCBI report)
120    pub aliases: HashSet<String>,
121
122    /// Assembly tag (AS)
123    pub assembly: Option<String>,
124
125    /// URI (UR)
126    pub uri: Option<String>,
127
128    /// Species (SP)
129    pub species: Option<String>,
130
131    /// Sequence role from NCBI assembly report
132    pub sequence_role: SequenceRole,
133
134    /// Sources that contributed to this contig
135    pub sources: Vec<String>,
136}
137
138impl ContigMetadata {
139    fn new(name: String) -> Self {
140        Self {
141            primary_name: name,
142            length: None,
143            md5: None,
144            sha512t24u: None,
145            aliases: HashSet::new(),
146            assembly: None,
147            uri: None,
148            species: None,
149            sequence_role: SequenceRole::Unknown,
150            sources: Vec::new(),
151        }
152    }
153
154    /// Convert to a Contig
155    fn to_contig(&self) -> Option<Contig> {
156        let length = self.length?;
157        let mut contig = Contig::new(&self.primary_name, length);
158        contig.md5.clone_from(&self.md5);
159        contig.sha512t24u.clone_from(&self.sha512t24u);
160        contig.assembly.clone_from(&self.assembly);
161        contig.uri.clone_from(&self.uri);
162        contig.species.clone_from(&self.species);
163        contig.aliases = self.aliases.iter().cloned().collect();
164        contig.sequence_role = self.sequence_role;
165        Some(contig)
166    }
167}
168
169/// Record of a processed input file
170#[derive(Debug, Clone)]
171pub struct InputRecord {
172    pub path: String,
173    pub format: InputFormat,
174    pub contigs_found: usize,
175    pub contigs_merged: usize,
176    pub aliases_added: usize,
177}
178
179/// Builder that collates metadata from multiple input sources
180pub struct ReferenceBuilder {
181    id: String,
182    display_name: String,
183    assembly: Option<Assembly>,
184    source: Option<ReferenceSource>,
185    description: Option<String>,
186    download_url: Option<String>,
187    assembly_report_url: Option<String>,
188    tags: Vec<String>,
189
190    /// Contigs keyed by EXACT primary name
191    contigs: HashMap<String, ContigMetadata>,
192
193    /// Preserve original contig order (first seen)
194    contig_order: Vec<String>,
195
196    /// Reverse lookup: alias -> primary name
197    alias_to_primary: HashMap<String, String>,
198
199    /// Records of processed inputs
200    inputs_processed: Vec<InputRecord>,
201
202    /// Conflicts detected
203    conflicts: Vec<String>,
204
205    /// Warnings
206    warnings: Vec<String>,
207
208    /// Optional species override for all contigs (e.g., "Homo sapiens").
209    /// When set, overrides whatever the dict's SP tag had.
210    species: Option<String>,
211
212    /// Whether to generate UCSC-style names for patches when parsing NCBI assembly reports.
213    ///
214    /// When `true` (default), for fix-patches and novel-patches that have "na" in the
215    /// UCSC-style-name column, a UCSC-style name will be generated using the convention:
216    /// `chr{chromosome}_{accession}v{version}_{suffix}` where suffix is `_fix` or `_alt`.
217    ///
218    /// This is useful for matching queries that use UCSC naming (like hg38.p12) against
219    /// NCBI assembly reports that don't include UCSC names for patches.
220    ///
221    /// Set to `false` to disable this behavior and only use names explicitly present
222    /// in the assembly report.
223    ///
224    /// See module documentation for [`crate::parsing::ncbi_report`] for details on the
225    /// naming convention and verification sources.
226    generate_ucsc_names: bool,
227}
228
229impl ReferenceBuilder {
230    /// Create a new builder with required fields
231    pub fn new(id: impl Into<String>, display_name: impl Into<String>) -> Self {
232        Self {
233            id: id.into(),
234            display_name: display_name.into(),
235            assembly: None,
236            source: None,
237            description: None,
238            download_url: None,
239            assembly_report_url: None,
240            tags: Vec::new(),
241            contigs: HashMap::new(),
242            contig_order: Vec::new(),
243            alias_to_primary: HashMap::new(),
244            inputs_processed: Vec::new(),
245            conflicts: Vec::new(),
246            warnings: Vec::new(),
247            species: None,
248            generate_ucsc_names: true, // Default: generate UCSC names for patches
249        }
250    }
251
252    /// Configure whether to generate UCSC-style names for patches.
253    ///
254    /// When `true` (default), for fix-patches and novel-patches that have "na" in the
255    /// UCSC-style-name column of NCBI assembly reports, a UCSC-style name will be
256    /// generated using the convention: `chr{chromosome}_{accession}v{version}_{suffix}`
257    /// where suffix is `_fix` for fix-patches or `_alt` for novel-patches.
258    ///
259    /// Set to `false` to disable this behavior (opt-out) and only use names explicitly
260    /// present in the assembly report.
261    ///
262    /// # Example
263    ///
264    /// ```
265    /// use ref_solver::catalog::builder::ReferenceBuilder;
266    ///
267    /// // Opt-out of UCSC name generation
268    /// let builder = ReferenceBuilder::new("my_ref", "My Reference")
269    ///     .generate_ucsc_names(false);
270    /// ```
271    #[must_use]
272    pub fn generate_ucsc_names(mut self, generate: bool) -> Self {
273        self.generate_ucsc_names = generate;
274        self
275    }
276
277    #[must_use]
278    pub fn assembly(mut self, assembly: Assembly) -> Self {
279        self.assembly = Some(assembly);
280        self
281    }
282
283    #[must_use]
284    pub fn source(mut self, source: ReferenceSource) -> Self {
285        self.source = Some(source);
286        self
287    }
288
289    #[must_use]
290    pub fn description(mut self, description: impl Into<String>) -> Self {
291        self.description = Some(description.into());
292        self
293    }
294
295    #[must_use]
296    pub fn download_url(mut self, url: impl Into<String>) -> Self {
297        self.download_url = Some(url.into());
298        self
299    }
300
301    #[must_use]
302    pub fn assembly_report_url(mut self, url: impl Into<String>) -> Self {
303        self.assembly_report_url = Some(url.into());
304        self
305    }
306
307    #[must_use]
308    pub fn tags(mut self, tags: Vec<String>) -> Self {
309        self.tags = tags;
310        self
311    }
312
313    /// Set a species override for all contigs (e.g., "Homo sapiens").
314    ///
315    /// When set, this value is written to each contig's `species` field, overriding
316    /// whatever the dict's SP tag had.
317    #[must_use]
318    pub fn species(mut self, species: impl Into<String>) -> Self {
319        self.species = Some(species.into());
320        self
321    }
322
323    /// Add an input file (auto-detect format)
324    ///
325    /// # Errors
326    ///
327    /// Returns `BuilderError::Parse` if format cannot be detected, or other
328    /// errors from parsing the specific format.
329    pub fn add_input(&mut self, path: &Path) -> Result<(), BuilderError> {
330        let format = InputFormat::from_path(path).ok_or_else(|| {
331            BuilderError::Parse(format!("Cannot detect format for: {}", path.display()))
332        })?;
333        self.add_input_with_format(path, format)
334    }
335
336    /// Add an input file with explicit format
337    ///
338    /// # Errors
339    ///
340    /// Returns `BuilderError::Io` if the file cannot be read, `BuilderError::Parse`
341    /// if parsing fails, or `BuilderError::Conflict` if contig data conflicts.
342    pub fn add_input_with_format(
343        &mut self,
344        path: &Path,
345        format: InputFormat,
346    ) -> Result<(), BuilderError> {
347        let path_str = path.display().to_string();
348
349        match format {
350            InputFormat::Dict | InputFormat::Sam => {
351                self.add_dict_or_sam(path, &path_str, format)?;
352            }
353            InputFormat::Bam => {
354                self.add_bam(path, &path_str)?;
355            }
356            InputFormat::Cram => {
357                self.add_cram(path, &path_str)?;
358            }
359            InputFormat::Fai => {
360                self.add_fai(path, &path_str)?;
361            }
362            InputFormat::Fasta => {
363                self.add_fasta(path, &path_str)?;
364            }
365            InputFormat::NcbiReport => {
366                self.add_ncbi_report(path, &path_str)?;
367            }
368            InputFormat::Vcf => {
369                self.add_vcf(path, &path_str)?;
370            }
371            InputFormat::Tsv => {
372                self.add_tsv(path, &path_str)?;
373            }
374        }
375
376        Ok(())
377    }
378
379    fn add_dict_or_sam(
380        &mut self,
381        path: &Path,
382        path_str: &str,
383        format: InputFormat,
384    ) -> Result<(), BuilderError> {
385        let content = std::fs::read_to_string(path)?;
386        let query = crate::parsing::sam::parse_header_text(&content)
387            .map_err(|e| BuilderError::Parse(e.to_string()))?;
388
389        let mut record = InputRecord {
390            path: path_str.to_string(),
391            format,
392            contigs_found: query.contigs.len(),
393            contigs_merged: 0,
394            aliases_added: 0,
395        };
396
397        for contig in query.contigs {
398            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
399            if merged {
400                record.contigs_merged += 1;
401            }
402            record.aliases_added += aliases;
403        }
404
405        self.inputs_processed.push(record);
406        Ok(())
407    }
408
409    fn add_bam(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
410        let query = crate::parsing::sam::parse_file(path)
411            .map_err(|e| BuilderError::Parse(e.to_string()))?;
412
413        let mut record = InputRecord {
414            path: path_str.to_string(),
415            format: InputFormat::Bam,
416            contigs_found: query.contigs.len(),
417            contigs_merged: 0,
418            aliases_added: 0,
419        };
420
421        for contig in query.contigs {
422            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
423            if merged {
424                record.contigs_merged += 1;
425            }
426            record.aliases_added += aliases;
427        }
428
429        self.inputs_processed.push(record);
430        Ok(())
431    }
432
433    fn add_cram(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
434        let query = crate::parsing::sam::parse_file(path)
435            .map_err(|e| BuilderError::Parse(e.to_string()))?;
436
437        let mut record = InputRecord {
438            path: path_str.to_string(),
439            format: InputFormat::Cram,
440            contigs_found: query.contigs.len(),
441            contigs_merged: 0,
442            aliases_added: 0,
443        };
444
445        for contig in query.contigs {
446            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
447            if merged {
448                record.contigs_merged += 1;
449            }
450            record.aliases_added += aliases;
451        }
452
453        self.inputs_processed.push(record);
454        Ok(())
455    }
456
457    fn add_fai(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
458        let content = std::fs::read_to_string(path)?;
459        let query = crate::parsing::fai::parse_fai_text(&content)
460            .map_err(|e| BuilderError::Parse(e.to_string()))?;
461
462        let mut record = InputRecord {
463            path: path_str.to_string(),
464            format: InputFormat::Fai,
465            contigs_found: query.contigs.len(),
466            contigs_merged: 0,
467            aliases_added: 0,
468        };
469
470        for contig in query.contigs {
471            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
472            if merged {
473                record.contigs_merged += 1;
474            }
475            record.aliases_added += aliases;
476        }
477
478        self.inputs_processed.push(record);
479        Ok(())
480    }
481
482    fn add_ncbi_report(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
483        let content = std::fs::read_to_string(path)?;
484        let entries = crate::parsing::ncbi_report::parse_ncbi_report_text(&content)
485            .map_err(|e| BuilderError::Parse(e.to_string()))?;
486
487        let mut record = InputRecord {
488            path: path_str.to_string(),
489            format: InputFormat::NcbiReport,
490            contigs_found: entries.len(),
491            contigs_merged: 0,
492            aliases_added: 0,
493        };
494
495        for entry in entries {
496            // Use generate_ucsc_names option to control UCSC name generation for patches
497            let contig = entry.to_contig_with_options(self.generate_ucsc_names);
498            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
499            if merged {
500                record.contigs_merged += 1;
501            }
502            record.aliases_added += aliases;
503        }
504
505        self.inputs_processed.push(record);
506        Ok(())
507    }
508
509    fn add_vcf(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
510        let content = std::fs::read_to_string(path)?;
511        let query = crate::parsing::vcf::parse_vcf_header_text(&content)
512            .map_err(|e| BuilderError::Parse(e.to_string()))?;
513
514        let mut record = InputRecord {
515            path: path_str.to_string(),
516            format: InputFormat::Vcf,
517            contigs_found: query.contigs.len(),
518            contigs_merged: 0,
519            aliases_added: 0,
520        };
521
522        for contig in query.contigs {
523            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
524            if merged {
525                record.contigs_merged += 1;
526            }
527            record.aliases_added += aliases;
528        }
529
530        self.inputs_processed.push(record);
531        Ok(())
532    }
533
534    fn add_tsv(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
535        let content = std::fs::read_to_string(path)?;
536        let query = crate::parsing::tsv::parse_tsv_text(&content, '\t')
537            .map_err(|e| BuilderError::Parse(e.to_string()))?;
538
539        let mut record = InputRecord {
540            path: path_str.to_string(),
541            format: InputFormat::Tsv,
542            contigs_found: query.contigs.len(),
543            contigs_merged: 0,
544            aliases_added: 0,
545        };
546
547        for contig in query.contigs {
548            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
549            if merged {
550                record.contigs_merged += 1;
551            }
552            record.aliases_added += aliases;
553        }
554
555        self.inputs_processed.push(record);
556        Ok(())
557    }
558
559    fn add_fasta(&mut self, path: &Path, path_str: &str) -> Result<(), BuilderError> {
560        let query = crate::parsing::fasta::parse_fasta_file_with_md5(path)
561            .map_err(|e| BuilderError::Parse(e.to_string()))?;
562
563        let mut record = InputRecord {
564            path: path_str.to_string(),
565            format: InputFormat::Fasta,
566            contigs_found: query.contigs.len(),
567            contigs_merged: 0,
568            aliases_added: 0,
569        };
570
571        for contig in query.contigs {
572            let (merged, aliases) = self.merge_contig(&contig, path_str)?;
573            if merged {
574                record.contigs_merged += 1;
575            }
576            record.aliases_added += aliases;
577        }
578
579        self.inputs_processed.push(record);
580        Ok(())
581    }
582
583    /// Check an optional digest field for conflicts, merging if no conflict.
584    /// Validates the incoming digest format using the provided validator function.
585    /// Returns `Err` with a conflict message if values differ or validation fails,
586    /// or `Ok(())` if merged/skipped.
587    fn check_digest_conflict(
588        existing: &mut Option<String>,
589        incoming: Option<&String>,
590        field_name: &str,
591        contig_name: &str,
592        source: &str,
593        conflicts: &mut Vec<String>,
594        validator: fn(&str) -> bool,
595    ) -> Result<(), BuilderError> {
596        if let Some(new_val) = incoming {
597            if !validator(new_val) {
598                let msg = format!(
599                    "Invalid {field_name} for '{contig_name}': '{new_val}' (from {source})"
600                );
601                return Err(BuilderError::Validation(msg));
602            }
603            if let Some(existing_val) = existing.as_ref() {
604                if existing_val != new_val {
605                    let msg = format!(
606                        "{field_name} conflict for '{contig_name}': {existing_val} vs {new_val} (from {source})"
607                    );
608                    conflicts.push(msg.clone());
609                    return Err(BuilderError::Conflict(msg));
610                }
611            } else {
612                *existing = Some(new_val.clone());
613            }
614        }
615        Ok(())
616    }
617
618    /// Find the primary name for a contig, checking both its name and aliases.
619    fn find_primary_for_contig(&self, contig: &Contig) -> Option<String> {
620        std::iter::once(&contig.name)
621            .chain(contig.aliases.iter())
622            .find_map(|name| self.find_existing_primary(name))
623    }
624
625    /// Merge a contig into the builder.
626    /// Returns (`was_merged_into_existing`, `num_aliases_added`)
627    fn merge_contig(
628        &mut self,
629        contig: &Contig,
630        source: &str,
631    ) -> Result<(bool, usize), BuilderError> {
632        if let Some(primary) = self.find_primary_for_contig(contig) {
633            // Merge into existing
634            let metadata = self.contigs.get_mut(&primary).unwrap();
635            let aliases_before = metadata.aliases.len();
636
637            // Check length conflict
638            if let Some(existing_len) = metadata.length {
639                if existing_len != contig.length {
640                    let msg = format!(
641                        "Length conflict for '{}': {} vs {} (from {})",
642                        contig.name, existing_len, contig.length, source
643                    );
644                    self.conflicts.push(msg.clone());
645                    return Err(BuilderError::Conflict(msg));
646                }
647            } else {
648                metadata.length = Some(contig.length);
649            }
650
651            // Check digest conflicts
652            Self::check_digest_conflict(
653                &mut metadata.md5,
654                contig.md5.as_ref(),
655                "MD5",
656                &contig.name,
657                source,
658                &mut self.conflicts,
659                is_valid_md5,
660            )?;
661            Self::check_digest_conflict(
662                &mut metadata.sha512t24u,
663                contig.sha512t24u.as_ref(),
664                "sha512t24u",
665                &contig.name,
666                source,
667                &mut self.conflicts,
668                is_valid_sha512t24u,
669            )?;
670
671            // Merge aliases
672            for alias in &contig.aliases {
673                if !metadata.aliases.contains(alias) && alias != &metadata.primary_name {
674                    // Check alias doesn't conflict with another contig
675                    if let Some(other_primary) = self.alias_to_primary.get(alias) {
676                        if other_primary != &primary {
677                            self.warnings.push(format!(
678                                "Alias '{alias}' already mapped to '{other_primary}', skipping for '{primary}'"
679                            ));
680                            continue;
681                        }
682                    }
683                    metadata.aliases.insert(alias.clone());
684                    self.alias_to_primary.insert(alias.clone(), primary.clone());
685                }
686            }
687
688            // Also add the contig's name as an alias if different from primary
689            if contig.name != primary && !metadata.aliases.contains(&contig.name) {
690                metadata.aliases.insert(contig.name.clone());
691                self.alias_to_primary
692                    .insert(contig.name.clone(), primary.clone());
693            }
694
695            // Fill other fields
696            if metadata.assembly.is_none() && contig.assembly.is_some() {
697                metadata.assembly.clone_from(&contig.assembly);
698            }
699            if metadata.uri.is_none() && contig.uri.is_some() {
700                metadata.uri.clone_from(&contig.uri);
701            }
702            if metadata.species.is_none() && contig.species.is_some() {
703                metadata.species.clone_from(&contig.species);
704            }
705            // Update sequence role if not already set
706            if matches!(metadata.sequence_role, SequenceRole::Unknown) {
707                metadata.sequence_role = contig.sequence_role;
708            }
709
710            metadata.sources.push(source.to_string());
711
712            let aliases_added = metadata.aliases.len() - aliases_before;
713            Ok((true, aliases_added))
714        } else {
715            // Create new entry
716            let mut metadata = ContigMetadata::new(contig.name.clone());
717            metadata.length = Some(contig.length);
718            metadata.md5.clone_from(&contig.md5);
719            metadata.sha512t24u.clone_from(&contig.sha512t24u);
720            metadata.assembly.clone_from(&contig.assembly);
721            metadata.uri.clone_from(&contig.uri);
722            metadata.species.clone_from(&contig.species);
723            metadata.sequence_role = contig.sequence_role;
724            metadata.sources.push(source.to_string());
725
726            // Add aliases
727            let mut aliases_added = 0;
728            for alias in &contig.aliases {
729                if alias != &contig.name {
730                    if let Some(other_primary) = self.alias_to_primary.get(alias) {
731                        self.warnings.push(format!(
732                            "Alias '{}' already mapped to '{}', skipping for '{}'",
733                            alias, other_primary, contig.name
734                        ));
735                        continue;
736                    }
737                    metadata.aliases.insert(alias.clone());
738                    self.alias_to_primary
739                        .insert(alias.clone(), contig.name.clone());
740                    aliases_added += 1;
741                }
742            }
743
744            self.contig_order.push(contig.name.clone());
745            self.contigs.insert(contig.name.clone(), metadata);
746
747            Ok((false, aliases_added))
748        }
749    }
750
751    /// Find the primary name for a contig name (checks exact match and aliases)
752    fn find_existing_primary(&self, name: &str) -> Option<String> {
753        // Check exact name match
754        if self.contigs.contains_key(name) {
755            return Some(name.to_string());
756        }
757
758        // Check alias match
759        if let Some(primary) = self.alias_to_primary.get(name) {
760            return Some(primary.clone());
761        }
762
763        None
764    }
765
766    /// Build the final `KnownReference`
767    ///
768    /// # Errors
769    ///
770    /// Returns `BuilderError::MissingField` if no contigs were added or required
771    /// fields are missing.
772    pub fn build(self) -> Result<KnownReference, BuilderError> {
773        // Validate
774        if self.contigs.is_empty() {
775            return Err(BuilderError::MissingField("No contigs added".to_string()));
776        }
777
778        // Check for missing lengths
779        let mut missing_length = Vec::new();
780        for name in &self.contig_order {
781            if let Some(metadata) = self.contigs.get(name) {
782                if metadata.length.is_none() {
783                    missing_length.push(name.clone());
784                }
785            }
786        }
787        if !missing_length.is_empty() {
788            return Err(BuilderError::MissingField(format!(
789                "Missing length for contigs: {missing_length:?}"
790            )));
791        }
792
793        // Build contigs in order
794        let mut contigs: Vec<Contig> = self
795            .contig_order
796            .iter()
797            .filter_map(|name| self.contigs.get(name))
798            .filter_map(ContigMetadata::to_contig)
799            .collect();
800
801        // Override per-contig URIs with the download URL if provided.
802        // Dict files embed local filesystem paths in the UR tag which should not
803        // leak into the catalog when a remote download URL is specified.
804        if let Some(ref url) = self.download_url {
805            for contig in &mut contigs {
806                contig.uri = Some(url.clone());
807            }
808        }
809
810        // Override per-contig assembly fields with the builder's assembly value.
811        // Dict files may embed inconsistent or missing AS tags; the CLI --assembly
812        // flag should be the authoritative source.
813        if let Some(ref assembly) = self.assembly {
814            let assembly_str = assembly.to_string();
815            for contig in &mut contigs {
816                contig.assembly = Some(assembly_str.clone());
817            }
818        }
819
820        // Override per-contig species fields with the builder's species value.
821        // Dict files may embed inconsistent SP tags; the CLI --species flag should
822        // be the authoritative source when provided.
823        if let Some(ref species) = self.species {
824            for contig in &mut contigs {
825                contig.species = Some(species.clone());
826            }
827        }
828
829        // Find contigs that are in assembly report but missing MD5 (i.e., not in FASTA)
830        // These are typically assembled-molecule contigs that use external references (like MT in CHM13)
831        let mut contigs_missing_from_fasta: Vec<String> = Vec::new();
832        for (name, meta) in &self.contigs {
833            if meta.md5.is_none() && matches!(meta.sequence_role, SequenceRole::AssembledMolecule) {
834                contigs_missing_from_fasta.push(name.clone());
835            }
836        }
837        contigs_missing_from_fasta.sort();
838
839        // Determine assembly if not set
840        let assembly = self
841            .assembly
842            .unwrap_or_else(|| detect_assembly_from_name(&self.display_name));
843
844        // Determine source if not set
845        let source = self
846            .source
847            .unwrap_or(ReferenceSource::Custom("Unknown".to_string()));
848
849        // Determine naming convention
850        let naming_convention = crate::core::contig::detect_naming_convention(&contigs);
851
852        let mut reference = KnownReference {
853            id: crate::core::types::ReferenceId::new(&self.id),
854            display_name: self.display_name,
855            assembly,
856            source,
857            naming_convention,
858            download_url: self.download_url,
859            assembly_report_url: self.assembly_report_url,
860            contigs,
861            description: self.description,
862            tags: self.tags,
863            contigs_missing_from_fasta,
864            md5_set: HashSet::new(),
865            sha512t24u_set: HashSet::new(),
866            name_length_set: HashSet::new(),
867            signature: None,
868        };
869
870        reference.rebuild_indexes();
871        Ok(reference)
872    }
873
874    /// Get summary of build
875    #[must_use]
876    pub fn summary(&self) -> BuildSummary {
877        let total_contigs = self.contigs.len();
878        let with_length = self.contigs.values().filter(|m| m.length.is_some()).count();
879        let with_md5 = self.contigs.values().filter(|m| m.md5.is_some()).count();
880        let with_aliases = self
881            .contigs
882            .values()
883            .filter(|m| !m.aliases.is_empty())
884            .count();
885
886        // Collect assembled-molecule contigs missing MD5 (these are important)
887        let mut missing_md5_assembled: Vec<String> = Vec::new();
888        for (name, meta) in &self.contigs {
889            if meta.md5.is_none() {
890                // Check if it's an assembled-molecule (primary chromosome)
891                if matches!(meta.sequence_role, SequenceRole::AssembledMolecule) {
892                    missing_md5_assembled.push(name.clone());
893                }
894            }
895        }
896
897        // Count primary chromosomes
898        let primary_count = self
899            .contigs
900            .keys()
901            .filter(|name| {
902                let contig = Contig::new(name.as_str(), 0);
903                contig.is_primary_chromosome()
904            })
905            .count();
906
907        // Count ALT contigs
908        let alt_count = self
909            .contigs
910            .keys()
911            .filter(|name| name.ends_with("_alt") || name.contains("_alt_"))
912            .count();
913
914        BuildSummary {
915            id: self.id.clone(),
916            display_name: self.display_name.clone(),
917            assembly: self.assembly.clone(),
918            source: self.source.clone(),
919            inputs: self.inputs_processed.clone(),
920            total_contigs,
921            primary_chromosomes: primary_count,
922            alt_contigs: alt_count,
923            other_contigs: total_contigs.saturating_sub(primary_count + alt_count),
924            with_length,
925            with_md5,
926            with_aliases,
927            missing_md5_assembled,
928            conflicts: self.conflicts.clone(),
929            warnings: self.warnings.clone(),
930        }
931    }
932}
933
/// Summary of the build process
///
/// A snapshot of a builder's state: identity, inputs processed, contig counts,
/// metadata coverage, and any conflicts or warnings. Its `Display`
/// implementation renders it as a multi-line, human-readable report.
#[derive(Debug, Clone)]
pub struct BuildSummary {
    /// Reference ID copied from the builder.
    pub id: String,
    /// Display name copied from the builder.
    pub display_name: String,
    /// Assembly configured on the builder, if any.
    pub assembly: Option<Assembly>,
    /// Reference source configured on the builder, if any.
    pub source: Option<ReferenceSource>,
    /// One record per input processed by the builder.
    pub inputs: Vec<InputRecord>,
    /// Total number of contigs collected.
    pub total_contigs: usize,
    /// Number of contigs whose name is classified as a primary chromosome.
    pub primary_chromosomes: usize,
    /// Number of contigs whose name ends with `_alt` or contains `_alt_`.
    pub alt_contigs: usize,
    /// Contigs that are neither primary nor ALT
    /// (`total_contigs - (primary_chromosomes + alt_contigs)`, saturating at zero).
    pub other_contigs: usize,
    /// Number of contigs with a known length.
    pub with_length: usize,
    /// Number of contigs with an MD5.
    pub with_md5: usize,
    /// Number of contigs with at least one alias.
    pub with_aliases: usize,
    /// Names of assembled-molecule contigs that have no MD5.
    /// Each one is counted as a warning when the summary is displayed.
    pub missing_md5_assembled: Vec<String>,
    /// Conflict messages recorded by the builder.
    pub conflicts: Vec<String>,
    /// Warning messages recorded by the builder.
    pub warnings: Vec<String>,
}
953
954impl std::fmt::Display for BuildSummary {
955    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
956        writeln!(f, "Reference Builder Summary")?;
957        writeln!(f, "=========================")?;
958        writeln!(f, "ID:       {}", self.id)?;
959        writeln!(f, "Name:     {}", self.display_name)?;
960        if let Some(ref assembly) = self.assembly {
961            writeln!(f, "Assembly: {assembly:?}")?;
962        }
963        if let Some(ref source) = self.source {
964            writeln!(f, "Source:   {source:?}")?;
965        }
966        writeln!(f)?;
967
968        writeln!(f, "Inputs:")?;
969        for (i, input) in self.inputs.iter().enumerate() {
970            writeln!(
971                f,
972                "  [{}] {} ({:?}) -> {} contigs, {} merged, {} aliases",
973                i + 1,
974                input.path,
975                input.format,
976                input.contigs_found,
977                input.contigs_merged,
978                input.aliases_added
979            )?;
980        }
981        writeln!(f)?;
982
983        writeln!(f, "Contigs: {} total", self.total_contigs)?;
984        writeln!(f, "  - Primary chromosomes: {}", self.primary_chromosomes)?;
985        writeln!(f, "  - ALT contigs: {}", self.alt_contigs)?;
986        writeln!(f, "  - Other: {}", self.other_contigs)?;
987        writeln!(f)?;
988
989        writeln!(f, "Coverage:")?;
990        let pct = |n: usize, total: usize| {
991            if total == 0 {
992                0
993            } else {
994                (n * 100) / total
995            }
996        };
997        let check = |n: usize, total: usize| {
998            if n == total {
999                "+"
1000            } else {
1001                "o"
1002            }
1003        };
1004        writeln!(
1005            f,
1006            "  {} Length:  {}/{} ({}%)",
1007            check(self.with_length, self.total_contigs),
1008            self.with_length,
1009            self.total_contigs,
1010            pct(self.with_length, self.total_contigs)
1011        )?;
1012        writeln!(
1013            f,
1014            "  {} MD5:     {}/{} ({}%)",
1015            check(self.with_md5, self.total_contigs),
1016            self.with_md5,
1017            self.total_contigs,
1018            pct(self.with_md5, self.total_contigs)
1019        )?;
1020        writeln!(
1021            f,
1022            "  {} Aliases: {}/{} ({}%)",
1023            check(self.with_aliases, self.total_contigs),
1024            self.with_aliases,
1025            self.total_contigs,
1026            pct(self.with_aliases, self.total_contigs)
1027        )?;
1028        writeln!(f)?;
1029
1030        writeln!(f, "Conflicts: {}", self.conflicts.len())?;
1031        for conflict in &self.conflicts {
1032            writeln!(f, "  - {conflict}")?;
1033        }
1034
1035        let total_warnings = self.warnings.len() + self.missing_md5_assembled.len();
1036        writeln!(f, "Warnings: {total_warnings}")?;
1037        for warning in &self.warnings {
1038            writeln!(f, "  - {warning}")?;
1039        }
1040        if !self.missing_md5_assembled.is_empty() {
1041            writeln!(
1042                f,
1043                "  - Missing MD5 for assembled-molecule contigs: {}",
1044                self.missing_md5_assembled.join(", ")
1045            )?;
1046        }
1047
1048        Ok(())
1049    }
1050}
1051
/// Builder for creating `FastaDistribution` from multiple input files
///
/// This builder merges contig metadata from multiple sources (dict, VCF, SAM/BAM, etc.)
/// keyed by (name, length). It handles:
/// - MD5 merging (first non-empty wins, error on conflict)
/// - Alias merging (union of all)
/// - Sort order preservation (first seen order)
///
/// Inputs are added with `add_input` / `add_input_with_format`; `build`
/// consumes the builder and produces the distribution.
#[derive(Debug)]
pub struct DistributionBuilder {
    /// Distribution ID, passed through to the built `FastaDistribution`.
    id: String,
    /// Human-readable name; empty until set with `with_display_name`.
    display_name: String,
    /// Where the reference comes from; `new` starts with `Custom("custom")`.
    source: ReferenceSource,
    /// Optional download URL, passed through to the built distribution.
    download_url: Option<String>,
    /// Free-form tags, passed through to the built distribution.
    tags: Vec<String>,

    /// Contigs keyed by (name, length) for merging
    contigs: HashMap<(String, u64), FastaContig>,

    /// Track insertion order
    /// (one key per distinct contig, pushed the first time that key is seen)
    insertion_order: Vec<(String, u64)>,

    /// Source files processed
    /// (the path is recorded before the file is parsed)
    source_files: Vec<String>,

    /// Whether to generate UCSC-style names for patches (see [`ReferenceBuilder`])
    generate_ucsc_names: bool,
}
1079
1080impl Default for DistributionBuilder {
1081    fn default() -> Self {
1082        Self::new("")
1083    }
1084}
1085
1086impl DistributionBuilder {
1087    /// Create a new builder with the given distribution ID
1088    pub fn new(id: impl Into<String>) -> Self {
1089        Self {
1090            id: id.into(),
1091            display_name: String::new(),
1092            source: ReferenceSource::Custom("custom".to_string()),
1093            download_url: None,
1094            tags: Vec::new(),
1095            contigs: HashMap::new(),
1096            insertion_order: Vec::new(),
1097            source_files: Vec::new(),
1098            generate_ucsc_names: true, // Default: generate UCSC names for patches
1099        }
1100    }
1101
1102    /// Configure whether to generate UCSC-style names for patches.
1103    ///
1104    /// See [`ReferenceBuilder::generate_ucsc_names`] for details.
1105    #[must_use]
1106    pub fn with_generate_ucsc_names(mut self, generate: bool) -> Self {
1107        self.generate_ucsc_names = generate;
1108        self
1109    }
1110
1111    /// Set the display name
1112    #[must_use]
1113    pub fn with_display_name(mut self, name: impl Into<String>) -> Self {
1114        self.display_name = name.into();
1115        self
1116    }
1117
1118    /// Set the reference source
1119    #[must_use]
1120    pub fn with_source(mut self, source: ReferenceSource) -> Self {
1121        self.source = source;
1122        self
1123    }
1124
1125    /// Set the download URL
1126    #[must_use]
1127    pub fn with_download_url(mut self, url: impl Into<String>) -> Self {
1128        self.download_url = Some(url.into());
1129        self
1130    }
1131
1132    /// Set tags
1133    #[must_use]
1134    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
1135        self.tags = tags;
1136        self
1137    }
1138
1139    /// Add an input file (auto-detect format)
1140    ///
1141    /// # Errors
1142    ///
1143    /// Returns `BuilderError::Parse` if format cannot be detected, or other
1144    /// errors from parsing the specific format.
1145    pub fn add_input(&mut self, path: &Path) -> Result<&mut Self, BuilderError> {
1146        let format = InputFormat::from_path(path).ok_or_else(|| {
1147            BuilderError::Parse(format!("Cannot detect format for: {}", path.display()))
1148        })?;
1149        self.add_input_with_format(path, format)
1150    }
1151
1152    /// Add an input file with explicit format
1153    ///
1154    /// # Errors
1155    ///
1156    /// Returns `BuilderError::Io` if the file cannot be read, `BuilderError::Parse`
1157    /// if parsing fails, or `BuilderError::Conflict` if contig data conflicts.
1158    pub fn add_input_with_format(
1159        &mut self,
1160        path: &Path,
1161        format: InputFormat,
1162    ) -> Result<&mut Self, BuilderError> {
1163        let path_str = path.display().to_string();
1164        self.source_files.push(path_str.clone());
1165
1166        let contigs = self.parse_input(path, format)?;
1167
1168        for contig in contigs {
1169            let key = (contig.name.clone(), contig.length);
1170            #[allow(clippy::cast_possible_truncation)] // Contig count limited by MAX_CONTIGS (50k)
1171            let fasta_contig = FastaContig {
1172                name: contig.name,
1173                length: contig.length,
1174                md5: contig.md5.unwrap_or_default(),
1175                sort_order: self.insertion_order.len() as u32,
1176                report_contig_id: None,
1177                aliases: contig.aliases,
1178            };
1179
1180            if let Some(existing) = self.contigs.get_mut(&key) {
1181                existing.merge(&fasta_contig)?;
1182            } else {
1183                self.insertion_order.push(key.clone());
1184                self.contigs.insert(key, fasta_contig);
1185            }
1186        }
1187
1188        Ok(self)
1189    }
1190
1191    /// Parse contigs from an input file
1192    fn parse_input(&self, path: &Path, format: InputFormat) -> Result<Vec<Contig>, BuilderError> {
1193        match format {
1194            InputFormat::Dict | InputFormat::Sam => {
1195                let content = std::fs::read_to_string(path)?;
1196                let query = crate::parsing::sam::parse_header_text(&content)
1197                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1198                Ok(query.contigs)
1199            }
1200            InputFormat::Bam | InputFormat::Cram => {
1201                let query = crate::parsing::sam::parse_file(path)
1202                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1203                Ok(query.contigs)
1204            }
1205            InputFormat::Fai => {
1206                let content = std::fs::read_to_string(path)?;
1207                let query = crate::parsing::fai::parse_fai_text(&content)
1208                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1209                Ok(query.contigs)
1210            }
1211            InputFormat::Fasta => {
1212                // Parse FASTA with MD5 computation
1213                let query = crate::parsing::fasta::parse_fasta_file_with_md5(path)
1214                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1215                Ok(query.contigs)
1216            }
1217            InputFormat::NcbiReport => {
1218                let content = std::fs::read_to_string(path)?;
1219                let entries = crate::parsing::ncbi_report::parse_ncbi_report_text(&content)
1220                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1221                // Use generate_ucsc_names option to control UCSC name generation for patches
1222                Ok(entries
1223                    .into_iter()
1224                    .map(|e| e.to_contig_with_options(self.generate_ucsc_names))
1225                    .collect())
1226            }
1227            InputFormat::Vcf => {
1228                let content = std::fs::read_to_string(path)?;
1229                let query = crate::parsing::vcf::parse_vcf_header_text(&content)
1230                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1231                Ok(query.contigs)
1232            }
1233            InputFormat::Tsv => {
1234                let content = std::fs::read_to_string(path)?;
1235                let query = crate::parsing::tsv::parse_tsv_text(&content, '\t')
1236                    .map_err(|e| BuilderError::Parse(e.to_string()))?;
1237                Ok(query.contigs)
1238            }
1239        }
1240    }
1241
1242    /// Build the `FastaDistribution`
1243    ///
1244    /// # Errors
1245    ///
1246    /// Returns `BuilderError::MissingField` if no contigs were found.
1247    pub fn build(self) -> Result<FastaDistribution, BuilderError> {
1248        if self.contigs.is_empty() {
1249            return Err(BuilderError::MissingField("No contigs found".to_string()));
1250        }
1251
1252        // Build contigs in insertion order
1253        let mut contigs: Vec<FastaContig> = Vec::with_capacity(self.insertion_order.len());
1254        for (i, key) in self.insertion_order.iter().enumerate() {
1255            if let Some(mut contig) = self.contigs.get(key).cloned() {
1256                #[allow(clippy::cast_possible_truncation)] // Contig count limited
1257                {
1258                    contig.sort_order = i as u32;
1259                }
1260                contigs.push(contig);
1261            }
1262        }
1263
1264        Ok(FastaDistribution {
1265            id: self.id,
1266            display_name: self.display_name,
1267            source: self.source,
1268            download_url: self.download_url,
1269            tags: self.tags,
1270            contigs,
1271        })
1272    }
1273}
1274
1275/// Detect assembly version from display name
1276#[must_use]
1277pub fn detect_assembly_from_name(display_name: &str) -> Assembly {
1278    let lower = display_name.to_lowercase();
1279    if lower.contains("chm13") || lower.contains("t2t") {
1280        Assembly::Other("CHM13".to_string())
1281    } else if lower.contains("38") || lower.contains("hg38") || lower.contains("hs38") {
1282        Assembly::Grch38
1283    } else if lower.contains("37")
1284        || lower.contains("19")
1285        || lower.contains("hg19")
1286        || lower.contains("hs37")
1287        || lower.contains("b37")
1288    {
1289        Assembly::Grch37
1290    } else {
1291        Assembly::Other(display_name.to_string())
1292    }
1293}
1294
1295#[cfg(test)]
1296mod tests {
1297    use super::*;
1298
1299    #[test]
1300    fn test_input_format_detection() {
1301        assert_eq!(
1302            InputFormat::from_path(Path::new("test.dict")),
1303            Some(InputFormat::Dict)
1304        );
1305        assert_eq!(
1306            InputFormat::from_path(Path::new("test.fai")),
1307            Some(InputFormat::Fai)
1308        );
1309        assert_eq!(
1310            InputFormat::from_path(Path::new("test.fa")),
1311            Some(InputFormat::Fasta)
1312        );
1313        assert_eq!(
1314            InputFormat::from_path(Path::new("test.fasta")),
1315            Some(InputFormat::Fasta)
1316        );
1317        assert_eq!(
1318            InputFormat::from_path(Path::new("test.fna")),
1319            Some(InputFormat::Fasta)
1320        );
1321        assert_eq!(
1322            InputFormat::from_path(Path::new("test.fa.gz")),
1323            Some(InputFormat::Fasta)
1324        );
1325        assert_eq!(
1326            InputFormat::from_path(Path::new("test.vcf")),
1327            Some(InputFormat::Vcf)
1328        );
1329        assert_eq!(
1330            InputFormat::from_path(Path::new("test.vcf.gz")),
1331            Some(InputFormat::Vcf)
1332        );
1333        assert_eq!(
1334            InputFormat::from_path(Path::new("GRCh38_assembly_report.txt")),
1335            Some(InputFormat::NcbiReport)
1336        );
1337    }
1338
1339    #[test]
1340    fn test_detect_assembly() {
1341        assert!(matches!(
1342            detect_assembly_from_name("GRCh38 (Broad)"),
1343            Assembly::Grch38
1344        ));
1345        assert!(matches!(
1346            detect_assembly_from_name("hg38 UCSC"),
1347            Assembly::Grch38
1348        ));
1349        assert!(matches!(
1350            detect_assembly_from_name("GRCh37"),
1351            Assembly::Grch37
1352        ));
1353        assert!(matches!(
1354            detect_assembly_from_name("hg19"),
1355            Assembly::Grch37
1356        ));
1357        assert!(matches!(
1358            detect_assembly_from_name("T2T-CHM13"),
1359            Assembly::Other(_)
1360        ));
1361    }
1362
1363    #[test]
1364    fn test_builder_single_input() {
1365        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1366            .assembly(Assembly::Grch38)
1367            .source(ReferenceSource::Custom("test".to_string()));
1368
1369        // Manually add a contig
1370        let contig = Contig::new("chr1", 248_956_422);
1371        builder.merge_contig(&contig, "test").unwrap();
1372
1373        let reference = builder.build().unwrap();
1374        assert_eq!(reference.id.0, "test_ref");
1375        assert_eq!(reference.contigs.len(), 1);
1376        assert_eq!(reference.contigs[0].name, "chr1");
1377    }
1378
1379    #[test]
1380    fn test_builder_merge_with_aliases() {
1381        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference");
1382
1383        // Add first contig
1384        let contig1 = Contig::new("chr1", 248_956_422);
1385        builder.merge_contig(&contig1, "source1").unwrap();
1386
1387        // Add same contig with different name (as alias)
1388        let mut contig2 = Contig::new("1", 248_956_422);
1389        contig2.aliases = vec!["chr1".to_string()];
1390        builder.merge_contig(&contig2, "source2").unwrap();
1391
1392        // Should have merged into one entry
1393        let summary = builder.summary();
1394        assert_eq!(summary.total_contigs, 1);
1395
1396        let reference = builder.build().unwrap();
1397        assert_eq!(reference.contigs.len(), 1);
1398        // The alias "1" should be in aliases
1399        assert!(reference.contigs[0].aliases.contains(&"1".to_string()));
1400    }
1401
1402    #[test]
1403    fn test_builder_conflict_detection() {
1404        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference");
1405
1406        // Add first contig
1407        let contig1 = Contig::new("chr1", 248_956_422);
1408        builder.merge_contig(&contig1, "source1").unwrap();
1409
1410        // Add conflicting contig (different length, same name)
1411        let contig2 = Contig::new("chr1", 100_000);
1412        let result = builder.merge_contig(&contig2, "source2");
1413        assert!(result.is_err());
1414        assert!(matches!(result.unwrap_err(), BuilderError::Conflict(_)));
1415    }
1416
1417    #[test]
1418    fn test_distribution_builder_single_dict() {
1419        use std::io::Write;
1420        use tempfile::NamedTempFile;
1421
1422        let mut file = NamedTempFile::with_suffix(".dict").unwrap();
1423        writeln!(file, "@HD\tVN:1.6").unwrap();
1424        writeln!(
1425            file,
1426            "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1427        )
1428        .unwrap();
1429        writeln!(
1430            file,
1431            "@SQ\tSN:chr2\tLN:2000\tM5:f98db672eb0993dcfdabafe2a882905c"
1432        )
1433        .unwrap();
1434
1435        let mut builder = DistributionBuilder::new("test_ref");
1436        builder.add_input(file.path()).unwrap();
1437        let dist = builder.build().unwrap();
1438
1439        assert_eq!(dist.contigs.len(), 2);
1440        assert_eq!(dist.contigs[0].name, "chr1");
1441        assert_eq!(dist.contigs[0].md5, "6aef897c3d6ff0c78aff06ac189178dd");
1442        assert_eq!(dist.contigs[1].name, "chr2");
1443        assert_eq!(dist.contigs[1].md5, "f98db672eb0993dcfdabafe2a882905c");
1444    }
1445
1446    #[test]
1447    fn test_distribution_builder_merges_inputs() {
1448        use std::io::Write;
1449        use tempfile::NamedTempFile;
1450
1451        // Dict without MD5
1452        let mut dict = NamedTempFile::with_suffix(".dict").unwrap();
1453        writeln!(dict, "@HD\tVN:1.6").unwrap();
1454        writeln!(dict, "@SQ\tSN:chr1\tLN:1000").unwrap();
1455
1456        // VCF with MD5
1457        let mut vcf = NamedTempFile::with_suffix(".vcf").unwrap();
1458        writeln!(vcf, "##fileformat=VCFv4.2").unwrap();
1459        writeln!(
1460            vcf,
1461            "##contig=<ID=chr1,length=1000,md5=6aef897c3d6ff0c78aff06ac189178dd>"
1462        )
1463        .unwrap();
1464        writeln!(vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO").unwrap();
1465
1466        let mut builder = DistributionBuilder::new("test_ref");
1467        builder.add_input(dict.path()).unwrap();
1468        builder.add_input(vcf.path()).unwrap();
1469        let distribution = builder.build().unwrap();
1470
1471        assert_eq!(distribution.contigs.len(), 1);
1472        assert_eq!(
1473            distribution.contigs[0].md5,
1474            "6aef897c3d6ff0c78aff06ac189178dd"
1475        );
1476    }
1477
1478    #[test]
1479    fn test_distribution_builder_md5_conflict() {
1480        use std::io::Write;
1481        use tempfile::NamedTempFile;
1482
1483        let mut dict = NamedTempFile::with_suffix(".dict").unwrap();
1484        writeln!(dict, "@HD\tVN:1.6").unwrap();
1485        writeln!(
1486            dict,
1487            "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1488        )
1489        .unwrap();
1490
1491        let mut vcf = NamedTempFile::with_suffix(".vcf").unwrap();
1492        writeln!(vcf, "##fileformat=VCFv4.2").unwrap();
1493        writeln!(
1494            vcf,
1495            "##contig=<ID=chr1,length=1000,md5=f98db672eb0993dcfdabafe2a882905c>"
1496        )
1497        .unwrap();
1498        writeln!(vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO").unwrap();
1499
1500        let mut builder = DistributionBuilder::new("test_ref");
1501        builder.add_input(dict.path()).unwrap();
1502        let result = builder.add_input(vcf.path());
1503
1504        assert!(matches!(result, Err(BuilderError::Merge(_))));
1505    }
1506
1507    #[test]
1508    fn test_builder_download_url_overrides_contig_uri() {
1509        let local_uri = "file:///local/path/to/ref.fasta";
1510        let download_url = "https://example.com/ref.fasta";
1511
1512        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1513            .assembly(Assembly::Grch38)
1514            .source(ReferenceSource::Custom("test".to_string()))
1515            .download_url(download_url);
1516
1517        // Manually add contigs with local URIs (as a dict file would produce)
1518        let mut contig1 = Contig::new("chr1", 248_956_422);
1519        contig1.uri = Some(local_uri.to_string());
1520        builder.merge_contig(&contig1, "test").unwrap();
1521
1522        let mut contig2 = Contig::new("chr2", 242_193_529);
1523        contig2.uri = Some(local_uri.to_string());
1524        builder.merge_contig(&contig2, "test").unwrap();
1525
1526        let reference = builder.build().unwrap();
1527
1528        // The download_url should be set on the reference
1529        assert_eq!(reference.download_url.as_deref(), Some(download_url));
1530
1531        // Each contig's uri should be the download URL, not the local path
1532        for contig in &reference.contigs {
1533            assert_eq!(
1534                contig.uri.as_deref(),
1535                Some(download_url),
1536                "contig {} uri should be the download URL, not the local path",
1537                contig.name
1538            );
1539        }
1540    }
1541
1542    #[test]
1543    fn test_builder_no_download_url_preserves_contig_uri() {
1544        let local_uri = "file:///local/path/to/ref.fasta";
1545
1546        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1547            .assembly(Assembly::Grch38)
1548            .source(ReferenceSource::Custom("test".to_string()));
1549        // No download_url set
1550
1551        let mut contig = Contig::new("chr1", 248_956_422);
1552        contig.uri = Some(local_uri.to_string());
1553        builder.merge_contig(&contig, "test").unwrap();
1554
1555        let reference = builder.build().unwrap();
1556
1557        // Without download_url, the original local URI should be preserved
1558        assert_eq!(reference.contigs[0].uri.as_deref(), Some(local_uri));
1559    }
1560
1561    #[test]
1562    fn test_builder_download_url_overrides_dict_file_uri() {
1563        use std::io::Write;
1564        use tempfile::NamedTempFile;
1565
1566        let download_url = "https://storage.googleapis.com/bucket/ref.fasta";
1567
1568        // Create a dict file with a local UR tag (as real dict files have)
1569        let mut dict_file = NamedTempFile::with_suffix(".dict").unwrap();
1570        writeln!(dict_file, "@HD\tVN:1.6").unwrap();
1571        writeln!(
1572            dict_file,
1573            "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd\tUR:file:///local/ref.fa"
1574        )
1575        .unwrap();
1576        writeln!(
1577            dict_file,
1578            "@SQ\tSN:chr2\tLN:2000\tM5:f98db672eb0993dcfdabafe2a882905c\tUR:file:///local/ref.fa"
1579        )
1580        .unwrap();
1581
1582        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1583            .assembly(Assembly::Grch38)
1584            .source(ReferenceSource::Custom("test".to_string()))
1585            .download_url(download_url);
1586
1587        builder.add_input(dict_file.path()).unwrap();
1588
1589        let reference = builder.build().unwrap();
1590
1591        assert_eq!(reference.download_url.as_deref(), Some(download_url));
1592        for contig in &reference.contigs {
1593            assert_eq!(
1594                contig.uri.as_deref(),
1595                Some(download_url),
1596                "contig {} uri should be overridden by download_url",
1597                contig.name
1598            );
1599        }
1600    }
1601
1602    #[test]
1603    fn test_distribution_builder_preserves_order() {
1604        use std::io::Write;
1605        use tempfile::NamedTempFile;
1606
1607        let mut file = NamedTempFile::with_suffix(".dict").unwrap();
1608        writeln!(file, "@HD\tVN:1.6").unwrap();
1609        writeln!(
1610            file,
1611            "@SQ\tSN:chrM\tLN:16569\tM5:d2ed829b8a1628d16cbeee88e88e39eb"
1612        )
1613        .unwrap();
1614        writeln!(
1615            file,
1616            "@SQ\tSN:chr1\tLN:248956422\tM5:6aef897c3d6ff0c78aff06ac189178dd"
1617        )
1618        .unwrap();
1619        writeln!(
1620            file,
1621            "@SQ\tSN:chr2\tLN:242193529\tM5:f98db672eb0993dcfdabafe2a882905c"
1622        )
1623        .unwrap();
1624
1625        let mut builder = DistributionBuilder::new("test_ref");
1626        builder.add_input(file.path()).unwrap();
1627        let dist = builder.build().unwrap();
1628
1629        assert_eq!(dist.contigs[0].name, "chrM");
1630        assert_eq!(dist.contigs[0].sort_order, 0);
1631        assert_eq!(dist.contigs[1].name, "chr1");
1632        assert_eq!(dist.contigs[1].sort_order, 1);
1633        assert_eq!(dist.contigs[2].name, "chr2");
1634        assert_eq!(dist.contigs[2].sort_order, 2);
1635    }
1636
1637    #[test]
1638    fn test_builder_assembly_overrides_contig_assembly() {
1639        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1640            .assembly(Assembly::Grch38)
1641            .source(ReferenceSource::Custom("test".to_string()));
1642
1643        // Add contigs with existing assembly fields (as dict AS tags would produce)
1644        let mut contig1 = Contig::new("chr1", 248_956_422);
1645        contig1.assembly = Some("hg38".to_string());
1646        builder.merge_contig(&contig1, "test").unwrap();
1647
1648        let mut contig2 = Contig::new("chr2", 242_193_529);
1649        contig2.assembly = Some("hg38".to_string());
1650        builder.merge_contig(&contig2, "test").unwrap();
1651
1652        let reference = builder.build().unwrap();
1653
1654        // Each contig's assembly should be the builder's Assembly Display value
1655        for contig in &reference.contigs {
1656            assert_eq!(
1657                contig.assembly.as_deref(),
1658                Some("GRCh38"),
1659                "contig {} assembly should be overridden by builder assembly",
1660                contig.name
1661            );
1662        }
1663    }
1664
1665    #[test]
1666    fn test_builder_no_assembly_preserves_contig_assembly() {
1667        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1668            .source(ReferenceSource::Custom("test".to_string()));
1669        // No assembly set on builder
1670
1671        let mut contig = Contig::new("chr1", 248_956_422);
1672        contig.assembly = Some("hg38".to_string());
1673        builder.merge_contig(&contig, "test").unwrap();
1674
1675        let reference = builder.build().unwrap();
1676
1677        // Without builder assembly, the original contig assembly should be preserved
1678        assert_eq!(reference.contigs[0].assembly.as_deref(), Some("hg38"));
1679    }
1680
1681    #[test]
1682    fn test_builder_species_overrides_contig_species() {
1683        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1684            .assembly(Assembly::Grch38)
1685            .source(ReferenceSource::Custom("test".to_string()))
1686            .species("Homo sapiens");
1687
1688        // Add contigs with existing species fields (as dict SP tags would produce)
1689        let mut contig1 = Contig::new("chr1", 248_956_422);
1690        contig1.species = Some("Human".to_string());
1691        builder.merge_contig(&contig1, "test").unwrap();
1692
1693        let contig2 = Contig::new("chr2", 242_193_529);
1694        // No species set on this contig
1695        builder.merge_contig(&contig2, "test").unwrap();
1696
1697        let reference = builder.build().unwrap();
1698
1699        // Each contig's species should be the builder's species value
1700        for contig in &reference.contigs {
1701            assert_eq!(
1702                contig.species.as_deref(),
1703                Some("Homo sapiens"),
1704                "contig {} species should be overridden by builder species",
1705                contig.name
1706            );
1707        }
1708    }
1709
1710    #[test]
1711    fn test_builder_no_species_preserves_contig_species() {
1712        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1713            .assembly(Assembly::Grch38)
1714            .source(ReferenceSource::Custom("test".to_string()));
1715        // No species set on builder
1716
1717        let mut contig = Contig::new("chr1", 248_956_422);
1718        contig.species = Some("Human".to_string());
1719        builder.merge_contig(&contig, "test").unwrap();
1720
1721        let reference = builder.build().unwrap();
1722
1723        // Without builder species, the original contig species should be preserved
1724        assert_eq!(reference.contigs[0].species.as_deref(), Some("Human"));
1725    }
1726
1727    #[test]
1728    fn test_builder_assembly_overrides_dict_file_as_tag() {
1729        use std::io::Write;
1730        use tempfile::NamedTempFile;
1731
1732        // Create a dict file with AS and SP tags
1733        let mut dict_file = NamedTempFile::with_suffix(".dict").unwrap();
1734        writeln!(dict_file, "@HD\tVN:1.6").unwrap();
1735        writeln!(
1736            dict_file,
1737            "@SQ\tSN:chr1\tLN:1000\tM5:6aef897c3d6ff0c78aff06ac189178dd\tAS:hg38\tSP:Human"
1738        )
1739        .unwrap();
1740        writeln!(
1741            dict_file,
1742            "@SQ\tSN:chr2\tLN:2000\tM5:f98db672eb0993dcfdabafe2a882905c\tAS:hg38\tSP:Human"
1743        )
1744        .unwrap();
1745
1746        let mut builder = ReferenceBuilder::new("test_ref", "Test Reference")
1747            .assembly(Assembly::Grch38)
1748            .source(ReferenceSource::Custom("test".to_string()))
1749            .species("Homo sapiens");
1750
1751        builder.add_input(dict_file.path()).unwrap();
1752
1753        let reference = builder.build().unwrap();
1754
1755        for contig in &reference.contigs {
1756            assert_eq!(
1757                contig.assembly.as_deref(),
1758                Some("GRCh38"),
1759                "contig {} assembly should be overridden by builder assembly",
1760                contig.name
1761            );
1762            assert_eq!(
1763                contig.species.as_deref(),
1764                Some("Homo sapiens"),
1765                "contig {} species should be overridden by builder species",
1766                contig.name
1767            );
1768        }
1769    }
1770
1771    #[test]
1772    fn test_builder_chm13_assembly_display() {
1773        let mut builder = ReferenceBuilder::new("chm13", "T2T-CHM13v2.0")
1774            .assembly(Assembly::Other("T2T-CHM13v2.0".to_string()))
1775            .source(ReferenceSource::Custom("test".to_string()));
1776
1777        let contig = Contig::new("chr1", 248_387_328);
1778        builder.merge_contig(&contig, "test").unwrap();
1779
1780        let reference = builder.build().unwrap();
1781
1782        assert_eq!(
1783            reference.contigs[0].assembly.as_deref(),
1784            Some("T2T-CHM13v2.0")
1785        );
1786    }
1787}