Skip to main content

ref_solver/cli/
catalog.rs

1use std::path::PathBuf;
2
3use clap::{Args, Subcommand};
4
5use crate::catalog::builder::{InputFormat, ReferenceBuilder};
6use crate::catalog::hierarchical::HierarchicalCatalog;
7use crate::catalog::store::ReferenceCatalog;
8use crate::cli::OutputFormat;
9use crate::core::types::{Assembly, ReferenceSource};
10
/// Widens a `usize` count to `f64` so it can take part in ratio arithmetic.
///
/// The cast can only lose precision for counts beyond what an `f64` mantissa
/// represents exactly, which is why the lint is silenced here on purpose.
#[inline]
#[allow(clippy::cast_precision_loss)]
fn count_to_f64(count: usize) -> f64 {
    count as f64
}
19
// Argument container consumed by `run`: it holds nothing but the selected
// catalog subcommand. Plain `//` comments are used here (rather than `///`)
// so the clap-derived help output is left exactly as it was.
#[derive(Args)]
pub struct CatalogArgs {
    // The catalog operation chosen on the command line; `run` matches on it.
    #[command(subcommand)]
    pub command: CatalogCommands,
}
25
// Subcommands of the catalog CLI. `run` matches on this enum and forwards each
// variant's fields, unchanged apart from the notes below, to a `run_*` handler.
//
// NOTE: the `///` doc comments in this enum are picked up by clap's derive
// macros as user-visible help text, so reviewer notes use plain `//` comments.
#[derive(Subcommand)]
#[non_exhaustive]
pub enum CatalogCommands {
    // Handled by `run_list`.
    /// List all references in the catalog
    List {
        // When omitted, `run_list` loads the embedded catalog instead.
        /// Path to custom catalog file
        #[arg(long)]
        catalog: Option<PathBuf>,

        // Applied by `run_list` as a case-insensitive substring match.
        /// Filter by assembly (e.g., "`GRCh38`")
        #[arg(long)]
        assembly: Option<String>,

        // Applied by `run_list` as a case-insensitive substring match.
        /// Filter by source (e.g., "UCSC")
        #[arg(long)]
        source: Option<String>,
    },

    // Handled by `run_show`.
    /// Show details of a specific reference
    Show {
        /// Reference ID
        #[arg(required = true)]
        id: String,

        // When omitted, `run_show` loads the embedded catalog instead.
        /// Path to custom catalog file
        #[arg(long)]
        catalog: Option<PathBuf>,

        // Without this flag the text output of `run_show` stops after 25 contigs.
        /// Show all contigs
        #[arg(long)]
        all_contigs: bool,
    },

    // Handled by `run_export`.
    /// Export the catalog to a file
    Export {
        /// Output file path
        #[arg(required = true)]
        output: PathBuf,

        /// Path to custom catalog file to export (defaults to embedded)
        #[arg(long)]
        catalog: Option<PathBuf>,
    },

    // Handled by `run_list_hierarchical`.
    /// List hierarchical catalog contents (assemblies, versions, distributions)
    ListHierarchical {
        /// Path to hierarchical catalog file
        #[arg(required = true)]
        catalog: PathBuf,
    },

    // Handled by `run_build_hierarchical`.
    /// Build a hierarchical catalog entry (`FastaDistribution`)
    BuildHierarchical {
        /// Distribution ID (e.g., "`hg38_custom`")
        #[arg(long, required = true)]
        id: String,

        /// Display name (e.g., "hg38 Custom Build")
        #[arg(long, required = true)]
        name: String,

        // Each path is checked for existence before being fed to the builder.
        /// Input file(s) - can be specified multiple times
        #[arg(short, long = "input", required = true, num_args = 1..)]
        inputs: Vec<PathBuf>,

        // An explicit value takes precedence over an inferred assembly ID.
        /// Assembly ID to attach to (e.g., "grch38")
        #[arg(long)]
        assembly_id: Option<String>,

        // An explicit value takes precedence over an inferred version ID.
        /// Version ID to attach to (e.g., "`grch38_p14`")
        #[arg(long)]
        version_id: Option<String>,

        // When omitted, the handler falls back to a custom source named "custom".
        /// Source organization (ucsc, ncbi, broad, ensembl, 1kg, dragen, gdc, or custom)
        #[arg(long)]
        source: Option<String>,

        /// Reference FASTA download URL
        #[arg(long)]
        download_url: Option<String>,

        // Split on ',' with each piece trimmed by the handler.
        /// Tags (comma-separated)
        #[arg(long)]
        tags: Option<String>,

        /// Output file (creates new hierarchical catalog or standalone distribution JSON)
        #[arg(short, long)]
        output: Option<PathBuf>,

        // Also serves as the inference catalog when `--infer-assembly` has no value.
        /// Append to existing hierarchical catalog
        #[arg(long)]
        append_to: Option<PathBuf>,

        /// Overwrite if distribution ID already exists
        #[arg(long)]
        force: bool,

        // The handler fails, listing the offending contig names, if any MD5 is empty.
        /// Require MD5 checksums for all contigs
        #[arg(long)]
        require_md5: bool,

        /// Infer base assembly by matching MD5s against an existing catalog
        /// If no path given, uses the embedded catalog (or --append-to catalog)
        #[arg(long)]
        #[allow(clippy::option_option)]
        // Distinguishes: not present / present without value / present with value
        infer_assembly: Option<Option<PathBuf>>,

        // `run` negates this opt-out flag and passes it on as `generate_ucsc_names`.
        /// Disable automatic generation of UCSC-style names for patches.
        ///
        /// By default, when parsing NCBI assembly reports, UCSC-style names are
        /// generated for fix-patches and novel-patches that have "na" in the
        /// UCSC-style-name column (common in reports prior to p13).
        ///
        /// Use this flag to disable this behavior and only use names explicitly
        /// present in the assembly report.
        ///
        /// See: <https://genome.ucsc.edu/FAQ/FAQdownloads.html>
        #[arg(long)]
        no_generate_ucsc_names: bool,
    },

    // Handled by `run_build`.
    /// Build a new reference entry from input files
    Build {
        /// Unique reference ID (e.g., "`grch38_custom`")
        #[arg(long, required = true)]
        id: String,

        /// Display name (e.g., "`GRCh38` Custom Build")
        #[arg(long, required = true)]
        name: String,

        /// Input file(s) - can be specified multiple times
        /// Supported formats: .dict, .fai, .sam, .bam, .cram, .vcf, _`assembly_report.txt`
        #[arg(short, long = "input", required = true, num_args = 1..)]
        inputs: Vec<PathBuf>,

        /// Assembly version (grch37, grch38, or custom name)
        #[arg(long)]
        assembly: Option<String>,

        /// Source organization (ucsc, ncbi, broad, ensembl, illumina, 1kg, or custom)
        #[arg(long)]
        source: Option<String>,

        /// Description text
        #[arg(long)]
        description: Option<String>,

        /// Reference FASTA download URL
        #[arg(long)]
        download_url: Option<String>,

        /// NCBI assembly report URL
        #[arg(long)]
        assembly_report_url: Option<String>,

        /// Comma-separated tags
        #[arg(long)]
        tags: Option<String>,

        /// Species name to set on all contigs (e.g., "Homo sapiens")
        #[arg(long)]
        species: Option<String>,

        /// Output file (JSON). If not specified, prints to stdout
        #[arg(short, long)]
        output: Option<PathBuf>,

        /// Append to existing catalog file
        #[arg(long)]
        append_to: Option<PathBuf>,

        /// Force overwrite if ID already exists in catalog
        #[arg(long)]
        force: bool,

        // Parsed as an `InputFormatArg`, which converts into the builder's `InputFormat`.
        /// Force input format instead of auto-detection
        #[arg(long, value_enum)]
        input_format: Option<InputFormatArg>,

        /// Error if any contig lacks MD5 checksum
        #[arg(long)]
        require_md5: bool,

        // `run` negates this opt-out flag before passing it to `run_build`.
        /// Disable automatic generation of UCSC-style names for patches.
        ///
        /// By default, when parsing NCBI assembly reports, UCSC-style names are
        /// generated for fix-patches and novel-patches that have "na" in the
        /// UCSC-style-name column (common in reports prior to p13).
        ///
        /// Use this flag to disable this behavior and only use names explicitly
        /// present in the assembly report.
        ///
        /// See: <https://genome.ucsc.edu/FAQ/FAQdownloads.html>
        #[arg(long)]
        no_generate_ucsc_names: bool,
    },
}
225
/// Input format argument for CLI
// Accepted by the `--input-format` option of the `Build` subcommand. The
// variants correspond one-to-one with `InputFormat`, into which this type is
// converted by the `From` impl in this file. Variant notes are deliberately
// plain `//` comments: `///` on a `ValueEnum` variant would show up in --help.
#[derive(Clone, Copy, Debug, clap::ValueEnum)]
pub enum InputFormatArg {
    Dict,
    Fai,
    Fasta,
    // NOTE(review): presumably the NCBI `*_assembly_report.txt` format — confirm.
    NcbiReport,
    Sam,
    Bam,
    Cram,
    Vcf,
    Tsv,
}
239
240impl From<InputFormatArg> for InputFormat {
241    fn from(arg: InputFormatArg) -> Self {
242        match arg {
243            InputFormatArg::Dict => InputFormat::Dict,
244            InputFormatArg::Fai => InputFormat::Fai,
245            InputFormatArg::Fasta => InputFormat::Fasta,
246            InputFormatArg::NcbiReport => InputFormat::NcbiReport,
247            InputFormatArg::Sam => InputFormat::Sam,
248            InputFormatArg::Bam => InputFormat::Bam,
249            InputFormatArg::Cram => InputFormat::Cram,
250            InputFormatArg::Vcf => InputFormat::Vcf,
251            InputFormatArg::Tsv => InputFormat::Tsv,
252        }
253    }
254}
255
256/// Execute catalog subcommand
257///
258/// # Errors
259///
260/// Returns an error if the catalog cannot be loaded or the operation fails.
261pub fn run(args: CatalogArgs, format: OutputFormat, verbose: bool) -> anyhow::Result<()> {
262    match args.command {
263        CatalogCommands::List {
264            catalog,
265            assembly,
266            source,
267        } => run_list(
268            catalog,
269            assembly.as_deref(),
270            source.as_deref(),
271            format,
272            verbose,
273        ),
274        CatalogCommands::Show {
275            id,
276            catalog,
277            all_contigs,
278        } => run_show(id, catalog, all_contigs, format),
279        CatalogCommands::Export { output, catalog } => run_export(output, catalog),
280        CatalogCommands::ListHierarchical { catalog } => {
281            run_list_hierarchical(catalog, format, verbose)
282        }
283        CatalogCommands::BuildHierarchical {
284            id,
285            name,
286            inputs,
287            assembly_id,
288            version_id,
289            source,
290            download_url,
291            tags,
292            output,
293            append_to,
294            force,
295            require_md5,
296            infer_assembly,
297            no_generate_ucsc_names,
298        } => run_build_hierarchical(
299            id,
300            name,
301            inputs,
302            assembly_id,
303            version_id,
304            source,
305            download_url,
306            tags,
307            output,
308            append_to,
309            force,
310            require_md5,
311            infer_assembly,
312            !no_generate_ucsc_names, // Convert opt-out flag to opt-in parameter
313            format,
314            verbose,
315        ),
316        CatalogCommands::Build {
317            id,
318            name,
319            inputs,
320            assembly,
321            source,
322            description,
323            download_url,
324            assembly_report_url,
325            tags,
326            species,
327            output,
328            append_to,
329            force,
330            input_format,
331            require_md5,
332            no_generate_ucsc_names,
333        } => run_build(
334            id,
335            name,
336            inputs,
337            assembly,
338            source,
339            description,
340            download_url,
341            assembly_report_url,
342            tags,
343            species,
344            output,
345            append_to,
346            force,
347            input_format,
348            require_md5,
349            !no_generate_ucsc_names, // Convert opt-out flag to opt-in parameter
350            format,
351            verbose,
352        ),
353    }
354}
355
356#[allow(clippy::too_many_lines)] // TODO: Refactor into smaller functions
357fn run_list(
358    catalog_path: Option<PathBuf>,
359    assembly_filter: Option<&str>,
360    source_filter: Option<&str>,
361    format: OutputFormat,
362    verbose: bool,
363) -> anyhow::Result<()> {
364    let catalog = if let Some(path) = catalog_path {
365        ReferenceCatalog::load_from_file(&path)?
366    } else {
367        ReferenceCatalog::load_embedded()?
368    };
369
370    if verbose {
371        eprintln!("Loaded catalog with {} references", catalog.len());
372    }
373
374    // Filter references
375    let filtered: Vec<_> = catalog
376        .references
377        .iter()
378        .filter(|r| {
379            if let Some(assembly) = &assembly_filter {
380                let ref_assembly = format!("{}", r.assembly).to_lowercase();
381                if !ref_assembly.contains(&assembly.to_lowercase()) {
382                    return false;
383                }
384            }
385            if let Some(source) = &source_filter {
386                let ref_source = format!("{}", r.source).to_lowercase();
387                if !ref_source.contains(&source.to_lowercase()) {
388                    return false;
389                }
390            }
391            true
392        })
393        .collect();
394
395    match format {
396        OutputFormat::Text => {
397            // Calculate column widths dynamically
398            let id_width = filtered
399                .iter()
400                .map(|r| r.id.0.len())
401                .max()
402                .unwrap_or(2)
403                .max(2);
404            let name_width = filtered
405                .iter()
406                .map(|r| r.display_name.len().min(35))
407                .max()
408                .unwrap_or(4)
409                .max(4);
410            let assembly_width = filtered
411                .iter()
412                .map(|r| format!("{}", r.assembly).len())
413                .max()
414                .unwrap_or(8)
415                .max(8);
416            let source_width = filtered
417                .iter()
418                .map(|r| format!("{}", r.source).len())
419                .max()
420                .unwrap_or(6)
421                .max(6);
422
423            let total_width = id_width + name_width + assembly_width + source_width + 8 + 8;
424
425            println!("Reference Catalog ({} references)\n", filtered.len());
426            println!(
427                "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
428                "ID",
429                "Name",
430                "Assembly",
431                "Source",
432                "Contigs",
433                id_w = id_width,
434                name_w = name_width,
435                asm_w = assembly_width,
436                src_w = source_width
437            );
438            println!("{}", "-".repeat(total_width));
439
440            for r in &filtered {
441                println!(
442                    "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
443                    r.id.0,
444                    truncate(&r.display_name, name_width),
445                    format!("{}", r.assembly),
446                    format!("{}", r.source),
447                    r.contigs.len(),
448                    id_w = id_width,
449                    name_w = name_width,
450                    asm_w = assembly_width,
451                    src_w = source_width
452                );
453                if verbose {
454                    let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
455                    let md5_pct = if r.contigs.is_empty() {
456                        0.0
457                    } else {
458                        100.0 * count_to_f64(md5_count) / count_to_f64(r.contigs.len())
459                    };
460                    if let Some(url) = &r.download_url {
461                        println!(
462                            "  └─ MD5: {}/{} ({:.0}%)  URL: {}",
463                            md5_count,
464                            r.contigs.len(),
465                            md5_pct,
466                            url
467                        );
468                    } else {
469                        println!(
470                            "  └─ MD5: {}/{} ({:.0}%)",
471                            md5_count,
472                            r.contigs.len(),
473                            md5_pct
474                        );
475                    }
476                }
477            }
478        }
479        OutputFormat::Json => {
480            let output: Vec<serde_json::Value> = filtered
481                .iter()
482                .map(|r| {
483                    let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
484                    let role_counts = r.role_counts();
485                    let mut json = serde_json::json!({
486                        "id": r.id.0,
487                        "display_name": r.display_name,
488                        "assembly": format!("{}", r.assembly),
489                        "source": format!("{}", r.source),
490                        "contig_count": r.contigs.len(),
491                        "md5_count": md5_count,
492                        "has_decoy": r.has_decoy(),
493                        "has_alt": r.has_alt(),
494                        "fasta_url": r.download_url,
495                        "assembly_report_url": r.assembly_report_url,
496                        "role_counts": {
497                            "assembled_molecule": role_counts.assembled_molecule,
498                            "alt_scaffold": role_counts.alt_scaffold,
499                            "fix_patch": role_counts.fix_patch,
500                            "novel_patch": role_counts.novel_patch,
501                            "unlocalized_scaffold": role_counts.unlocalized_scaffold,
502                            "unplaced_scaffold": role_counts.unplaced_scaffold,
503                            "unknown": role_counts.unknown,
504                        },
505                        "tags": r.tags,
506                    });
507                    // Add contigs_missing_from_fasta only if non-empty
508                    if !r.contigs_missing_from_fasta.is_empty() {
509                        json["contigs_missing_from_fasta"] =
510                            serde_json::json!(&r.contigs_missing_from_fasta);
511                    }
512                    json
513                })
514                .collect();
515            println!("{}", serde_json::to_string_pretty(&output)?);
516        }
517        OutputFormat::Tsv => {
518            println!("id\tdisplay_name\tassembly\tsource\tcontig_count\tmd5_count\thas_decoy\thas_alt\tdownload_url");
519            for r in &filtered {
520                let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
521                println!(
522                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
523                    r.id.0,
524                    r.display_name,
525                    r.assembly,
526                    r.source,
527                    r.contigs.len(),
528                    md5_count,
529                    r.has_decoy(),
530                    r.has_alt(),
531                    r.download_url.as_deref().unwrap_or("")
532                );
533            }
534        }
535    }
536
537    Ok(())
538}
539
540#[allow(clippy::needless_pass_by_value)] // CLI entry point, values from clap
541fn run_show(
542    id: String,
543    catalog_path: Option<PathBuf>,
544    all_contigs: bool,
545    format: OutputFormat,
546) -> anyhow::Result<()> {
547    let catalog = if let Some(path) = catalog_path {
548        ReferenceCatalog::load_from_file(&path)?
549    } else {
550        ReferenceCatalog::load_embedded()?
551    };
552
553    let ref_id = crate::core::types::ReferenceId::new(&id);
554    let reference = catalog
555        .get(&ref_id)
556        .ok_or_else(|| anyhow::anyhow!("Reference '{id}' not found"))?;
557
558    match format {
559        OutputFormat::Text => {
560            println!("Reference: {}\n", reference.display_name);
561            println!("ID:       {}", reference.id);
562            println!("Assembly: {}", reference.assembly);
563            println!("Source:   {}", reference.source);
564            println!("Naming:   {:?}", reference.naming_convention);
565            println!("Contigs:  {}", reference.contigs.len());
566            println!("Has Decoy: {}", reference.has_decoy());
567            println!("Has ALT:   {}", reference.has_alt());
568
569            if let Some(desc) = &reference.description {
570                println!("\nDescription: {desc}");
571            }
572
573            if let Some(url) = &reference.download_url {
574                println!("\nDownload URL: {url}");
575            }
576
577            if !reference.tags.is_empty() {
578                println!("\nTags: {}", reference.tags.join(", "));
579            }
580
581            let contigs_to_show = if all_contigs {
582                &reference.contigs[..]
583            } else {
584                &reference.contigs[..reference.contigs.len().min(25)]
585            };
586
587            println!("\nContigs:");
588            println!("{:<25} {:>15} MD5", "Name", "Length");
589            println!("{}", "-".repeat(80));
590            for contig in contigs_to_show {
591                println!(
592                    "{:<25} {:>15} {}",
593                    contig.name,
594                    contig.length,
595                    contig.md5.as_deref().unwrap_or("-")
596                );
597            }
598
599            if !all_contigs && reference.contigs.len() > 25 {
600                println!(
601                    "\n... and {} more contigs (use --all-contigs to show all)",
602                    reference.contigs.len() - 25
603                );
604            }
605        }
606        OutputFormat::Json => {
607            println!("{}", serde_json::to_string_pretty(&reference)?);
608        }
609        OutputFormat::Tsv => {
610            println!("name\tlength\tmd5");
611            for contig in &reference.contigs {
612                println!(
613                    "{}\t{}\t{}",
614                    contig.name,
615                    contig.length,
616                    contig.md5.as_deref().unwrap_or("")
617                );
618            }
619        }
620    }
621
622    Ok(())
623}
624
625#[allow(clippy::needless_pass_by_value)] // CLI entry point, values from clap
626fn run_export(output: PathBuf, catalog_path: Option<PathBuf>) -> anyhow::Result<()> {
627    let catalog = if let Some(path) = catalog_path {
628        ReferenceCatalog::load_from_file(&path)?
629    } else {
630        ReferenceCatalog::load_embedded()?
631    };
632
633    let json = catalog.to_json()?;
634    std::fs::write(&output, json)?;
635
636    println!(
637        "Exported {} references to {}",
638        catalog.len(),
639        output.display()
640    );
641
642    Ok(())
643}
644
645#[allow(clippy::needless_pass_by_value, clippy::too_many_lines)] // CLI entry point; TODO: refactor
646fn run_list_hierarchical(
647    catalog_path: PathBuf,
648    format: OutputFormat,
649    verbose: bool,
650) -> anyhow::Result<()> {
651    let catalog = HierarchicalCatalog::load(&catalog_path)?;
652
653    if verbose {
654        eprintln!(
655            "Loaded hierarchical catalog v{} with {} assemblies",
656            catalog.version,
657            catalog.assemblies.len()
658        );
659    }
660
661    match format {
662        OutputFormat::Text => {
663            println!("Hierarchical Reference Catalog (v{})\n", catalog.version);
664
665            // Count totals
666            let mut total_versions = 0;
667            let mut total_distributions = 0;
668            let mut total_contigs = 0;
669
670            for assembly in &catalog.assemblies {
671                total_versions += assembly.versions.len();
672                for version in &assembly.versions {
673                    total_distributions += version.fasta_distributions.len();
674                    for dist in &version.fasta_distributions {
675                        total_contigs += dist.contigs.len();
676                    }
677                }
678            }
679
680            println!(
681                "Summary: {} assemblies, {} versions, {} distributions, {} total contigs\n",
682                catalog.assemblies.len(),
683                total_versions,
684                total_distributions,
685                total_contigs
686            );
687
688            // List hierarchy
689            for assembly in &catalog.assemblies {
690                println!("{} ({})", assembly.name, assembly.id);
691                println!("  Organism: {}", assembly.organism);
692
693                for version in &assembly.versions {
694                    println!("\n  Version: {} ({})", version.version, version.id);
695
696                    // Show report source
697                    match &version.source {
698                        crate::core::assembly::ReportSource::Ncbi { accession, .. } => {
699                            println!("    Source: NCBI ({accession})");
700                        }
701                        crate::core::assembly::ReportSource::DerivedFromFasta {
702                            base_assembly,
703                            ..
704                        } => {
705                            if let Some(base) = base_assembly {
706                                println!("    Source: Derived from FASTA (base: {base})");
707                            } else {
708                                println!("    Source: Derived from FASTA");
709                            }
710                        }
711                        crate::core::assembly::ReportSource::Manual { .. } => {
712                            println!("    Source: Manual");
713                        }
714                    }
715
716                    if !version.report_contigs.is_empty() {
717                        println!("    Report contigs: {}", version.report_contigs.len());
718                    }
719
720                    println!("    Distributions:");
721                    for dist in &version.fasta_distributions {
722                        let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
723                        println!(
724                            "      - {} ({}): {} contigs, {} with MD5",
725                            dist.display_name,
726                            dist.id,
727                            dist.contigs.len(),
728                            md5_count
729                        );
730
731                        if verbose {
732                            // Show presence breakdown
733                            let counts = dist.presence_counts();
734                            if counts.in_both > 0 || counts.fasta_only > 0 {
735                                println!(
736                                    "        Presence: {} in-both, {} fasta-only",
737                                    counts.in_both, counts.fasta_only
738                                );
739                            }
740
741                            if let Some(url) = &dist.download_url {
742                                println!("        URL: {url}");
743                            }
744                        }
745                    }
746                }
747                println!();
748            }
749
750            // Standalone distributions
751            if !catalog.standalone_distributions.is_empty() {
752                println!("Standalone Distributions:");
753                for dist in &catalog.standalone_distributions {
754                    let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
755                    println!(
756                        "  - {} ({}): {} contigs, {} with MD5",
757                        dist.display_name,
758                        dist.id,
759                        dist.contigs.len(),
760                        md5_count
761                    );
762                }
763            }
764        }
765        OutputFormat::Json => {
766            println!("{}", serde_json::to_string_pretty(&catalog)?);
767        }
768        OutputFormat::Tsv => {
769            println!(
770                "assembly_id\tversion_id\tdistribution_id\tdisplay_name\tcontig_count\tmd5_count"
771            );
772            for assembly in &catalog.assemblies {
773                for version in &assembly.versions {
774                    for dist in &version.fasta_distributions {
775                        let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
776                        println!(
777                            "{}\t{}\t{}\t{}\t{}\t{}",
778                            assembly.id,
779                            version.id,
780                            dist.id,
781                            dist.display_name,
782                            dist.contigs.len(),
783                            md5_count
784                        );
785                    }
786                }
787            }
788            // Standalone
789            for dist in &catalog.standalone_distributions {
790                let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
791                println!(
792                    "\t\t{}\t{}\t{}\t{}",
793                    dist.id,
794                    dist.display_name,
795                    dist.contigs.len(),
796                    md5_count
797                );
798            }
799        }
800    }
801
802    Ok(())
803}
804
805#[allow(
806    clippy::too_many_arguments,
807    clippy::needless_pass_by_value,
808    clippy::option_option,
809    clippy::too_many_lines,
810    clippy::fn_params_excessive_bools
811)] // CLI entry point
812fn run_build_hierarchical(
813    id: String,
814    name: String,
815    inputs: Vec<PathBuf>,
816    assembly_id: Option<String>,
817    version_id: Option<String>,
818    source: Option<String>,
819    download_url: Option<String>,
820    tags: Option<String>,
821    output: Option<PathBuf>,
822    append_to: Option<PathBuf>,
823    force: bool,
824    require_md5: bool,
825    infer_assembly: Option<Option<PathBuf>>,
826    generate_ucsc_names: bool,
827    format: OutputFormat,
828    verbose: bool,
829) -> anyhow::Result<()> {
830    use crate::catalog::builder::DistributionBuilder;
831
832    // Parse source
833    let ref_source = source.map_or(ReferenceSource::Custom("custom".to_string()), |s| {
834        parse_reference_source(&s)
835    });
836
837    // Parse tags
838    let tags: Vec<String> = tags
839        .map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
840        .unwrap_or_default();
841
842    // Create builder with UCSC name generation option
843    let mut builder = DistributionBuilder::new(&id)
844        .with_display_name(&name)
845        .with_source(ref_source)
846        .with_generate_ucsc_names(generate_ucsc_names);
847
848    if let Some(url) = download_url {
849        builder = builder.with_download_url(url);
850    }
851    if !tags.is_empty() {
852        builder = builder.with_tags(tags);
853    }
854
855    // Process input files
856    for input_path in &inputs {
857        if !input_path.exists() {
858            anyhow::bail!("Input file not found: {}", input_path.display());
859        }
860
861        if verbose {
862            eprintln!("Processing: {}", input_path.display());
863        }
864
865        builder.add_input(input_path)?;
866    }
867
868    // Build the distribution
869    let dist = builder.build()?;
870
871    // Check MD5 requirement
872    if require_md5 {
873        let missing_md5: Vec<_> = dist
874            .contigs
875            .iter()
876            .filter(|c| c.md5.is_empty())
877            .map(|c| c.name.as_str())
878            .collect();
879
880        if !missing_md5.is_empty() {
881            anyhow::bail!(
882                "MD5 required but {} contig(s) lack MD5: {}",
883                missing_md5.len(),
884                missing_md5.join(", ")
885            );
886        }
887    }
888
889    // Summary
890    let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
891    if verbose {
892        eprintln!(
893            "Built distribution '{}' with {} contigs ({} with MD5)",
894            id,
895            dist.contigs.len(),
896            md5_count
897        );
898    }
899
900    // Inference of base assembly
901    let (inferred_assembly_id, inferred_version_id) = if infer_assembly.is_some() {
902        // Load catalog for inference
903        let infer_catalog = match &infer_assembly {
904            Some(Some(path)) => Some(HierarchicalCatalog::load(path)?),
905            Some(None) => {
906                // Try to use append_to catalog first, otherwise try embedded (which doesn't exist in hierarchical format)
907                if let Some(ref append_path) = append_to {
908                    Some(HierarchicalCatalog::load(append_path)?)
909                } else {
910                    if verbose {
911                        eprintln!("Warning: No catalog specified for inference. Use --infer-assembly=<path> or --append-to");
912                    }
913                    None
914                }
915            }
916            None => None,
917        };
918
919        if let Some(ref catalog) = infer_catalog {
920            if let Some(inferred) = catalog.infer_base_assembly_default(&dist.contigs) {
921                if verbose {
922                    eprintln!(
923                        "Inferred base assembly: {} {} ({:.1}% match, {}/{} contigs)",
924                        inferred.assembly_name,
925                        inferred.version_string,
926                        inferred.match_rate * 100.0,
927                        inferred.matched_contigs,
928                        inferred.total_input_contigs
929                    );
930                }
931                (
932                    assembly_id.clone().or(Some(inferred.assembly_id)),
933                    version_id.clone().or(Some(inferred.version_id)),
934                )
935            } else {
936                if verbose {
937                    eprintln!("Could not infer base assembly (no match above 90% threshold)");
938                }
939                (assembly_id.clone(), version_id.clone())
940            }
941        } else {
942            (assembly_id.clone(), version_id.clone())
943        }
944    } else {
945        (assembly_id.clone(), version_id.clone())
946    };
947
948    // Output handling
949    if let Some(append_path) = append_to {
950        // Append to existing catalog
951        let mut catalog = HierarchicalCatalog::load(&append_path)?;
952
953        // Check if we need to add to a specific assembly/version (using inferred if available)
954        if let (Some(asm_id), Some(ver_id)) = (&inferred_assembly_id, &inferred_version_id) {
955            let mut found = false;
956            for assembly in &mut catalog.assemblies {
957                if assembly.id == *asm_id {
958                    for version in &mut assembly.versions {
959                        if version.id == *ver_id {
960                            // Check for existing distribution
961                            if !force && version.fasta_distributions.iter().any(|d| d.id == id) {
962                                anyhow::bail!(
963                                    "Distribution '{id}' already exists in version '{ver_id}'. Use --force to overwrite."
964                                );
965                            }
966
967                            // Remove existing if force
968                            version.fasta_distributions.retain(|d| d.id != id);
969                            version.fasta_distributions.push(dist.clone());
970                            found = true;
971                            break;
972                        }
973                    }
974                }
975            }
976            if !found {
977                anyhow::bail!("Assembly '{asm_id}' with version '{ver_id}' not found in catalog");
978            }
979        } else {
980            // Add as standalone distribution
981            if !force && catalog.standalone_distributions.iter().any(|d| d.id == id) {
982                anyhow::bail!(
983                    "Standalone distribution '{id}' already exists. Use --force to overwrite."
984                );
985            }
986            catalog.standalone_distributions.retain(|d| d.id != id);
987            catalog.standalone_distributions.push(dist.clone());
988        }
989
990        catalog.save(&append_path)?;
991        eprintln!("Added distribution '{}' to {}", id, append_path.display());
992    } else if let Some(out_path) = output {
993        // Create new output
994        if out_path.exists() && !force {
995            anyhow::bail!(
996                "Output file '{}' exists. Use --force to overwrite.",
997                out_path.display()
998            );
999        }
1000
1001        // Output as standalone distribution JSON or wrap in catalog
1002        if let OutputFormat::Json = format {
1003            // Just output the distribution as JSON
1004            let json = serde_json::to_string_pretty(&dist)?;
1005            std::fs::write(&out_path, json)?;
1006            eprintln!("Wrote distribution to {}", out_path.display());
1007        } else {
1008            // Create a catalog with just this distribution
1009            let catalog = HierarchicalCatalog::new().with_standalone_distribution(dist);
1010            catalog.save(&out_path)?;
1011            eprintln!("Wrote hierarchical catalog to {}", out_path.display());
1012        }
1013    } else {
1014        // Print to stdout
1015        match format {
1016            OutputFormat::Json => {
1017                println!("{}", serde_json::to_string_pretty(&dist)?);
1018            }
1019            OutputFormat::Text => {
1020                print_distribution_summary(&dist);
1021            }
1022            OutputFormat::Tsv => {
1023                println!("name\tlength\tmd5\treport_contig_id");
1024                for c in &dist.contigs {
1025                    println!(
1026                        "{}\t{}\t{}\t{}",
1027                        c.name,
1028                        c.length,
1029                        c.md5,
1030                        c.report_contig_id
1031                            .map(|i| i.to_string())
1032                            .unwrap_or_default()
1033                    );
1034                }
1035            }
1036        }
1037    }
1038
1039    Ok(())
1040}
1041
1042fn print_distribution_summary(dist: &crate::core::assembly::FastaDistribution) {
1043    println!("Distribution: {} ({})", dist.display_name, dist.id);
1044    println!("Source: {:?}", dist.source);
1045    if let Some(url) = &dist.download_url {
1046        println!("Download URL: {url}");
1047    }
1048    if !dist.tags.is_empty() {
1049        println!("Tags: {}", dist.tags.join(", "));
1050    }
1051    println!("Contigs: {}", dist.contigs.len());
1052
1053    let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
1054    println!("With MD5: {md5_count}");
1055
1056    let linked = dist
1057        .contigs
1058        .iter()
1059        .filter(|c| c.report_contig_id.is_some())
1060        .count();
1061    println!("Linked to report: {linked}");
1062
1063    // Show presence counts
1064    let counts = dist.presence_counts();
1065    if counts.in_both > 0 || counts.fasta_only > 0 {
1066        println!(
1067            "Presence: {} in-both, {} fasta-only",
1068            counts.in_both, counts.fasta_only
1069        );
1070    }
1071}
1072
1073fn parse_reference_source(s: &str) -> ReferenceSource {
1074    match s.to_lowercase().as_str() {
1075        "ucsc" => ReferenceSource::Ucsc,
1076        "ncbi" => ReferenceSource::Ncbi,
1077        "broad" => ReferenceSource::Broad,
1078        "ensembl" => ReferenceSource::Ensembl,
1079        "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
1080        "dragen" | "illumina" => ReferenceSource::Illumina,
1081        _ => ReferenceSource::Custom(s.to_string()),
1082    }
1083}
1084
/// Shortens `s` to at most `max_len` characters for display.
///
/// Strings that already fit are returned unchanged. Longer strings are cut
/// and suffixed with `...` so that the result is exactly `max_len` characters
/// long. When `max_len` is smaller than the ellipsis itself (less than 3),
/// the string is simply cut to `max_len` characters with no ellipsis.
///
/// Length is counted in `char`s rather than bytes, so multi-byte UTF-8 input
/// is never sliced in the middle of a code point (which would panic).
fn truncate(s: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    // Byte length is an upper bound on the char count, so the cheap check
    // short-circuits the common ASCII case before any chars are counted.
    if s.len() <= max_len || s.chars().count() <= max_len {
        return s.to_string();
    }

    match max_len.checked_sub(ELLIPSIS.len()) {
        Some(keep) => {
            let mut shortened: String = s.chars().take(keep).collect();
            shortened.push_str(ELLIPSIS);
            shortened
        }
        // No room for an ellipsis: hard-cut instead of underflowing.
        None => s.chars().take(max_len).collect(),
    }
}
1092
1093#[allow(
1094    clippy::too_many_arguments,
1095    clippy::needless_pass_by_value,
1096    clippy::too_many_lines,
1097    clippy::fn_params_excessive_bools
1098)] // CLI entry point
1099fn run_build(
1100    id: String,
1101    name: String,
1102    inputs: Vec<PathBuf>,
1103    assembly: Option<String>,
1104    source: Option<String>,
1105    description: Option<String>,
1106    download_url: Option<String>,
1107    assembly_report_url: Option<String>,
1108    tags: Option<String>,
1109    species: Option<String>,
1110    output: Option<PathBuf>,
1111    append_to: Option<PathBuf>,
1112    force: bool,
1113    input_format: Option<InputFormatArg>,
1114    require_md5: bool,
1115    generate_ucsc_names: bool,
1116    format: OutputFormat,
1117    verbose: bool,
1118) -> anyhow::Result<()> {
1119    // Parse assembly
1120    let assembly = assembly.map(|s| parse_assembly(&s));
1121
1122    // Parse source
1123    let source = source.map(|s| parse_source(&s));
1124
1125    // Parse tags
1126    let tags: Vec<String> = tags
1127        .map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
1128        .unwrap_or_default();
1129
1130    // Create builder with UCSC name generation option
1131    let mut builder = ReferenceBuilder::new(&id, &name).generate_ucsc_names(generate_ucsc_names);
1132
1133    if let Some(assembly) = assembly {
1134        builder = builder.assembly(assembly);
1135    }
1136    if let Some(source) = source {
1137        builder = builder.source(source);
1138    }
1139    if let Some(desc) = description {
1140        builder = builder.description(desc);
1141    }
1142    if let Some(url) = download_url {
1143        builder = builder.download_url(url);
1144    }
1145    if let Some(url) = assembly_report_url {
1146        builder = builder.assembly_report_url(url);
1147    }
1148    if !tags.is_empty() {
1149        builder = builder.tags(tags);
1150    }
1151    if let Some(sp) = species {
1152        builder = builder.species(sp);
1153    }
1154
1155    // Process input files
1156    for input_path in &inputs {
1157        if !input_path.exists() {
1158            anyhow::bail!("Input file not found: {}", input_path.display());
1159        }
1160
1161        if verbose {
1162            eprintln!("Processing: {}", input_path.display());
1163        }
1164
1165        if let Some(fmt) = input_format {
1166            builder.add_input_with_format(input_path, fmt.into())?;
1167        } else {
1168            builder.add_input(input_path)?;
1169        }
1170    }
1171
1172    // Get summary before building
1173    let summary = builder.summary();
1174
1175    // Check for conflicts
1176    if !summary.conflicts.is_empty() {
1177        eprintln!("Build failed due to conflicts:");
1178        for conflict in &summary.conflicts {
1179            eprintln!("  - {conflict}");
1180        }
1181        anyhow::bail!(
1182            "Build failed: {} conflict(s) detected",
1183            summary.conflicts.len()
1184        );
1185    }
1186
1187    // Check MD5 requirement
1188    if require_md5 && summary.with_md5 < summary.total_contigs {
1189        anyhow::bail!(
1190            "Build failed: --require-md5 specified but only {}/{} contigs have MD5",
1191            summary.with_md5,
1192            summary.total_contigs
1193        );
1194    }
1195
1196    // Build the reference
1197    let reference = builder.build()?;
1198
1199    // Print summary
1200    if verbose || matches!(format, OutputFormat::Text) {
1201        eprintln!("{summary}");
1202    }
1203
1204    // Handle output
1205    if let Some(catalog_path) = append_to {
1206        // Append to existing catalog
1207        let mut catalog = if catalog_path.exists() {
1208            ReferenceCatalog::load_from_file(&catalog_path)?
1209        } else {
1210            ReferenceCatalog::new()
1211        };
1212
1213        // Check if ID already exists
1214        let ref_id = crate::core::types::ReferenceId::new(&id);
1215        if catalog.get(&ref_id).is_some() {
1216            if force {
1217                eprintln!("Warning: Overwriting existing reference '{id}'");
1218                // Remove old reference by rebuilding catalog without it
1219                let refs: Vec<_> = catalog
1220                    .references
1221                    .into_iter()
1222                    .filter(|r| r.id != ref_id)
1223                    .collect();
1224                catalog = ReferenceCatalog::new();
1225                for r in refs {
1226                    catalog.add_reference(r);
1227                }
1228            } else {
1229                anyhow::bail!(
1230                    "Reference '{id}' already exists in catalog. Use --force to overwrite."
1231                );
1232            }
1233        }
1234
1235        catalog.add_reference(reference);
1236        let json = catalog.to_json()?;
1237        std::fs::write(&catalog_path, json)?;
1238
1239        println!(
1240            "Added reference '{}' to {} ({} total references)",
1241            id,
1242            catalog_path.display(),
1243            catalog.len()
1244        );
1245    } else if let Some(output_path) = output {
1246        // Write single reference to file
1247        let json = serde_json::to_string_pretty(&reference)?;
1248        std::fs::write(&output_path, &json)?;
1249        println!("Wrote reference '{}' to {}", id, output_path.display());
1250    } else {
1251        // Print to stdout
1252        match format {
1253            OutputFormat::Json => {
1254                println!("{}", serde_json::to_string_pretty(&reference)?);
1255            }
1256            OutputFormat::Text | OutputFormat::Tsv => {
1257                // Print summary info
1258                println!("Reference: {}", reference.display_name);
1259                println!("ID:        {}", reference.id);
1260                println!("Assembly:  {}", reference.assembly);
1261                println!("Source:    {}", reference.source);
1262                println!("Contigs:   {}", reference.contigs.len());
1263                println!();
1264                println!("Use --output <file> to save as JSON");
1265            }
1266        }
1267    }
1268
1269    Ok(())
1270}
1271
1272fn parse_assembly(s: &str) -> Assembly {
1273    let lower = s.to_lowercase();
1274    match lower.as_str() {
1275        "grch37" | "hg19" | "b37" => Assembly::Grch37,
1276        "grch38" | "hg38" => Assembly::Grch38,
1277        _ => Assembly::Other(s.to_string()),
1278    }
1279}
1280
1281fn parse_source(s: &str) -> ReferenceSource {
1282    let lower = s.to_lowercase();
1283    match lower.as_str() {
1284        "ucsc" => ReferenceSource::Ucsc,
1285        "ncbi" | "grc" => ReferenceSource::Ncbi,
1286        "broad" => ReferenceSource::Broad,
1287        "ensembl" => ReferenceSource::Ensembl,
1288        "illumina" | "dragen" => ReferenceSource::Illumina,
1289        "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
1290        _ => ReferenceSource::Custom(s.to_string()),
1291    }
1292}