Skip to main content

ref_solver/cli/
catalog.rs

1use std::path::PathBuf;
2
3use clap::{Args, Subcommand};
4
5use crate::catalog::builder::{InputFormat, ReferenceBuilder};
6use crate::catalog::hierarchical::HierarchicalCatalog;
7use crate::catalog::store::ReferenceCatalog;
8use crate::cli::OutputFormat;
9use crate::core::types::{Assembly, ReferenceSource};
10
/// Converts a `usize` count into an `f64` for ratio and percentage arithmetic.
///
/// An `f64` cannot represent every integer above 2^53, so very large counts
/// may be rounded. The precision-loss lint is silenced deliberately for this
/// conversion.
#[inline]
#[allow(clippy::cast_precision_loss)] // precision loss is explicitly accepted here
fn count_to_f64(count: usize) -> f64 {
    count as f64
}
19
// Argument wrapper for the catalog CLI: it only carries the selected
// subcommand, which `run` matches on.
// Maintainer notes here use plain `//` comments so that the help text clap
// derives from `///` doc comments is left unchanged.
#[derive(Args)]
pub struct CatalogArgs {
    // The subcommand chosen on the command line.
    #[command(subcommand)]
    pub command: CatalogCommands,
}
25
// Subcommands of the catalog CLI. `run` destructures each variant and forwards
// its fields to the matching `run_*` handler.
// The `///` comments below are consumed by clap as user-facing help text, so
// notes for maintainers are written as plain `//` comments instead.
#[derive(Subcommand)]
pub enum CatalogCommands {
    /// List all references in the catalog
    List {
        /// Path to custom catalog file
        #[arg(long)]
        catalog: Option<PathBuf>,

        /// Filter by assembly (e.g., "`GRCh38`")
        #[arg(long)]
        assembly: Option<String>,

        /// Filter by source (e.g., "UCSC")
        #[arg(long)]
        source: Option<String>,
    },

    /// Show details of a specific reference
    Show {
        /// Reference ID
        #[arg(required = true)]
        id: String,

        /// Path to custom catalog file
        #[arg(long)]
        catalog: Option<PathBuf>,

        /// Show all contigs
        #[arg(long)]
        all_contigs: bool,
    },

    /// Export the catalog to a file
    Export {
        /// Output file path
        #[arg(required = true)]
        output: PathBuf,

        /// Path to custom catalog file to export (defaults to embedded)
        #[arg(long)]
        catalog: Option<PathBuf>,
    },

    /// List hierarchical catalog contents (assemblies, versions, distributions)
    ListHierarchical {
        /// Path to hierarchical catalog file
        #[arg(required = true)]
        catalog: PathBuf,
    },

    /// Build a hierarchical catalog entry (`FastaDistribution`)
    BuildHierarchical {
        /// Distribution ID (e.g., "`hg38_custom`")
        #[arg(long, required = true)]
        id: String,

        /// Display name (e.g., "hg38 Custom Build")
        #[arg(long, required = true)]
        name: String,

        /// Input file(s) - can be specified multiple times
        #[arg(short, long = "input", required = true, num_args = 1..)]
        inputs: Vec<PathBuf>,

        /// Assembly ID to attach to (e.g., "grch38")
        #[arg(long)]
        assembly_id: Option<String>,

        /// Version ID to attach to (e.g., "`grch38_p14`")
        #[arg(long)]
        version_id: Option<String>,

        // NOTE(review): this list of sources differs from the one documented on
        // `Build`'s `--source` (dragen/gdc here vs. illumina there) — confirm
        // both against the source parser and align the help text.
        /// Source organization (ucsc, ncbi, broad, ensembl, 1kg, dragen, gdc, or custom)
        #[arg(long)]
        source: Option<String>,

        /// Reference FASTA download URL
        #[arg(long)]
        download_url: Option<String>,

        /// Tags (comma-separated)
        #[arg(long)]
        tags: Option<String>,

        /// Output file (creates new hierarchical catalog or standalone distribution JSON)
        #[arg(short, long)]
        output: Option<PathBuf>,

        /// Append to existing hierarchical catalog
        #[arg(long)]
        append_to: Option<PathBuf>,

        /// Overwrite if distribution ID already exists
        #[arg(long)]
        force: bool,

        /// Require MD5 checksums for all contigs
        #[arg(long)]
        require_md5: bool,

        // NOTE(review): the help text below mentions falling back to the
        // embedded catalog, but the handler only falls back to the
        // `--append-to` catalog and otherwise prints a warning (in verbose
        // mode) and skips inference — confirm and align the help text.
        /// Infer base assembly by matching MD5s against an existing catalog
        /// If no path given, uses the embedded catalog (or --append-to catalog)
        #[arg(long)]
        #[allow(clippy::option_option)]
        // Distinguishes: not present / present without value / present with value
        infer_assembly: Option<Option<PathBuf>>,

        /// Disable automatic generation of UCSC-style names for patches.
        ///
        /// By default, when parsing NCBI assembly reports, UCSC-style names are
        /// generated for fix-patches and novel-patches that have "na" in the
        /// UCSC-style-name column (common in reports prior to p13).
        ///
        /// Use this flag to disable this behavior and only use names explicitly
        /// present in the assembly report.
        ///
        /// See: <https://genome.ucsc.edu/FAQ/FAQdownloads.html>
        // Opt-out flag: `run` negates it before passing it on as the opt-in
        // `generate_ucsc_names` parameter.
        #[arg(long)]
        no_generate_ucsc_names: bool,
    },

    /// Build a new reference entry from input files
    Build {
        /// Unique reference ID (e.g., "`grch38_custom`")
        #[arg(long, required = true)]
        id: String,

        /// Display name (e.g., "`GRCh38` Custom Build")
        #[arg(long, required = true)]
        name: String,

        /// Input file(s) - can be specified multiple times
        /// Supported formats: .dict, .fai, .sam, .bam, .cram, .vcf, _`assembly_report.txt`
        #[arg(short, long = "input", required = true, num_args = 1..)]
        inputs: Vec<PathBuf>,

        /// Assembly version (grch37, grch38, or custom name)
        #[arg(long)]
        assembly: Option<String>,

        /// Source organization (ucsc, ncbi, broad, ensembl, illumina, 1kg, or custom)
        #[arg(long)]
        source: Option<String>,

        /// Description text
        #[arg(long)]
        description: Option<String>,

        /// Reference FASTA download URL
        #[arg(long)]
        download_url: Option<String>,

        /// NCBI assembly report URL
        #[arg(long)]
        assembly_report_url: Option<String>,

        /// Comma-separated tags
        #[arg(long)]
        tags: Option<String>,

        /// Output file (JSON). If not specified, prints to stdout
        #[arg(short, long)]
        output: Option<PathBuf>,

        /// Append to existing catalog file
        #[arg(long)]
        append_to: Option<PathBuf>,

        /// Force overwrite if ID already exists in catalog
        #[arg(long)]
        force: bool,

        // Converted to the builder's `InputFormat` through the `From` impl on
        // `InputFormatArg`.
        /// Force input format instead of auto-detection
        #[arg(long, value_enum)]
        input_format: Option<InputFormatArg>,

        /// Error if any contig lacks MD5 checksum
        #[arg(long)]
        require_md5: bool,

        /// Disable automatic generation of UCSC-style names for patches.
        ///
        /// By default, when parsing NCBI assembly reports, UCSC-style names are
        /// generated for fix-patches and novel-patches that have "na" in the
        /// UCSC-style-name column (common in reports prior to p13).
        ///
        /// Use this flag to disable this behavior and only use names explicitly
        /// present in the assembly report.
        ///
        /// See: <https://genome.ucsc.edu/FAQ/FAQdownloads.html>
        // Opt-out flag: `run` negates it before passing it on as the opt-in
        // `generate_ucsc_names` parameter.
        #[arg(long)]
        no_generate_ucsc_names: bool,
    },
}
220
/// Input format argument for CLI
// CLI-facing mirror of the builder's `InputFormat`: every variant here has a
// same-named counterpart that it converts to via `From<InputFormatArg>`.
// Variants are left without `///` docs so the clap-generated help for the
// possible values is unchanged.
#[derive(Clone, Copy, Debug, clap::ValueEnum)]
pub enum InputFormatArg {
    Dict,
    Fai,
    Fasta,
    NcbiReport,
    Sam,
    Bam,
    Cram,
    Vcf,
    Tsv,
}
234
235impl From<InputFormatArg> for InputFormat {
236    fn from(arg: InputFormatArg) -> Self {
237        match arg {
238            InputFormatArg::Dict => InputFormat::Dict,
239            InputFormatArg::Fai => InputFormat::Fai,
240            InputFormatArg::Fasta => InputFormat::Fasta,
241            InputFormatArg::NcbiReport => InputFormat::NcbiReport,
242            InputFormatArg::Sam => InputFormat::Sam,
243            InputFormatArg::Bam => InputFormat::Bam,
244            InputFormatArg::Cram => InputFormat::Cram,
245            InputFormatArg::Vcf => InputFormat::Vcf,
246            InputFormatArg::Tsv => InputFormat::Tsv,
247        }
248    }
249}
250
251/// Execute catalog subcommand
252///
253/// # Errors
254///
255/// Returns an error if the catalog cannot be loaded or the operation fails.
256pub fn run(args: CatalogArgs, format: OutputFormat, verbose: bool) -> anyhow::Result<()> {
257    match args.command {
258        CatalogCommands::List {
259            catalog,
260            assembly,
261            source,
262        } => run_list(
263            catalog,
264            assembly.as_deref(),
265            source.as_deref(),
266            format,
267            verbose,
268        ),
269        CatalogCommands::Show {
270            id,
271            catalog,
272            all_contigs,
273        } => run_show(id, catalog, all_contigs, format),
274        CatalogCommands::Export { output, catalog } => run_export(output, catalog),
275        CatalogCommands::ListHierarchical { catalog } => {
276            run_list_hierarchical(catalog, format, verbose)
277        }
278        CatalogCommands::BuildHierarchical {
279            id,
280            name,
281            inputs,
282            assembly_id,
283            version_id,
284            source,
285            download_url,
286            tags,
287            output,
288            append_to,
289            force,
290            require_md5,
291            infer_assembly,
292            no_generate_ucsc_names,
293        } => run_build_hierarchical(
294            id,
295            name,
296            inputs,
297            assembly_id,
298            version_id,
299            source,
300            download_url,
301            tags,
302            output,
303            append_to,
304            force,
305            require_md5,
306            infer_assembly,
307            !no_generate_ucsc_names, // Convert opt-out flag to opt-in parameter
308            format,
309            verbose,
310        ),
311        CatalogCommands::Build {
312            id,
313            name,
314            inputs,
315            assembly,
316            source,
317            description,
318            download_url,
319            assembly_report_url,
320            tags,
321            output,
322            append_to,
323            force,
324            input_format,
325            require_md5,
326            no_generate_ucsc_names,
327        } => run_build(
328            id,
329            name,
330            inputs,
331            assembly,
332            source,
333            description,
334            download_url,
335            assembly_report_url,
336            tags,
337            output,
338            append_to,
339            force,
340            input_format,
341            require_md5,
342            !no_generate_ucsc_names, // Convert opt-out flag to opt-in parameter
343            format,
344            verbose,
345        ),
346    }
347}
348
349#[allow(clippy::too_many_lines)] // TODO: Refactor into smaller functions
350fn run_list(
351    catalog_path: Option<PathBuf>,
352    assembly_filter: Option<&str>,
353    source_filter: Option<&str>,
354    format: OutputFormat,
355    verbose: bool,
356) -> anyhow::Result<()> {
357    let catalog = if let Some(path) = catalog_path {
358        ReferenceCatalog::load_from_file(&path)?
359    } else {
360        ReferenceCatalog::load_embedded()?
361    };
362
363    if verbose {
364        eprintln!("Loaded catalog with {} references", catalog.len());
365    }
366
367    // Filter references
368    let filtered: Vec<_> = catalog
369        .references
370        .iter()
371        .filter(|r| {
372            if let Some(assembly) = &assembly_filter {
373                let ref_assembly = format!("{}", r.assembly).to_lowercase();
374                if !ref_assembly.contains(&assembly.to_lowercase()) {
375                    return false;
376                }
377            }
378            if let Some(source) = &source_filter {
379                let ref_source = format!("{}", r.source).to_lowercase();
380                if !ref_source.contains(&source.to_lowercase()) {
381                    return false;
382                }
383            }
384            true
385        })
386        .collect();
387
388    match format {
389        OutputFormat::Text => {
390            // Calculate column widths dynamically
391            let id_width = filtered
392                .iter()
393                .map(|r| r.id.0.len())
394                .max()
395                .unwrap_or(2)
396                .max(2);
397            let name_width = filtered
398                .iter()
399                .map(|r| r.display_name.len().min(35))
400                .max()
401                .unwrap_or(4)
402                .max(4);
403            let assembly_width = filtered
404                .iter()
405                .map(|r| format!("{}", r.assembly).len())
406                .max()
407                .unwrap_or(8)
408                .max(8);
409            let source_width = filtered
410                .iter()
411                .map(|r| format!("{}", r.source).len())
412                .max()
413                .unwrap_or(6)
414                .max(6);
415
416            let total_width = id_width + name_width + assembly_width + source_width + 8 + 8;
417
418            println!("Reference Catalog ({} references)\n", filtered.len());
419            println!(
420                "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
421                "ID",
422                "Name",
423                "Assembly",
424                "Source",
425                "Contigs",
426                id_w = id_width,
427                name_w = name_width,
428                asm_w = assembly_width,
429                src_w = source_width
430            );
431            println!("{}", "-".repeat(total_width));
432
433            for r in &filtered {
434                println!(
435                    "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
436                    r.id.0,
437                    truncate(&r.display_name, name_width),
438                    format!("{}", r.assembly),
439                    format!("{}", r.source),
440                    r.contigs.len(),
441                    id_w = id_width,
442                    name_w = name_width,
443                    asm_w = assembly_width,
444                    src_w = source_width
445                );
446                if verbose {
447                    let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
448                    let md5_pct = if r.contigs.is_empty() {
449                        0.0
450                    } else {
451                        100.0 * count_to_f64(md5_count) / count_to_f64(r.contigs.len())
452                    };
453                    if let Some(url) = &r.download_url {
454                        println!(
455                            "  └─ MD5: {}/{} ({:.0}%)  URL: {}",
456                            md5_count,
457                            r.contigs.len(),
458                            md5_pct,
459                            url
460                        );
461                    } else {
462                        println!(
463                            "  └─ MD5: {}/{} ({:.0}%)",
464                            md5_count,
465                            r.contigs.len(),
466                            md5_pct
467                        );
468                    }
469                }
470            }
471        }
472        OutputFormat::Json => {
473            let output: Vec<serde_json::Value> = filtered
474                .iter()
475                .map(|r| {
476                    let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
477                    let role_counts = r.role_counts();
478                    let mut json = serde_json::json!({
479                        "id": r.id.0,
480                        "display_name": r.display_name,
481                        "assembly": format!("{}", r.assembly),
482                        "source": format!("{}", r.source),
483                        "contig_count": r.contigs.len(),
484                        "md5_count": md5_count,
485                        "has_decoy": r.has_decoy(),
486                        "has_alt": r.has_alt(),
487                        "fasta_url": r.download_url,
488                        "assembly_report_url": r.assembly_report_url,
489                        "role_counts": {
490                            "assembled_molecule": role_counts.assembled_molecule,
491                            "alt_scaffold": role_counts.alt_scaffold,
492                            "fix_patch": role_counts.fix_patch,
493                            "novel_patch": role_counts.novel_patch,
494                            "unlocalized_scaffold": role_counts.unlocalized_scaffold,
495                            "unplaced_scaffold": role_counts.unplaced_scaffold,
496                            "unknown": role_counts.unknown,
497                        },
498                        "tags": r.tags,
499                    });
500                    // Add contigs_missing_from_fasta only if non-empty
501                    if !r.contigs_missing_from_fasta.is_empty() {
502                        json["contigs_missing_from_fasta"] =
503                            serde_json::json!(&r.contigs_missing_from_fasta);
504                    }
505                    json
506                })
507                .collect();
508            println!("{}", serde_json::to_string_pretty(&output)?);
509        }
510        OutputFormat::Tsv => {
511            println!("id\tdisplay_name\tassembly\tsource\tcontig_count\tmd5_count\thas_decoy\thas_alt\tdownload_url");
512            for r in &filtered {
513                let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
514                println!(
515                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
516                    r.id.0,
517                    r.display_name,
518                    r.assembly,
519                    r.source,
520                    r.contigs.len(),
521                    md5_count,
522                    r.has_decoy(),
523                    r.has_alt(),
524                    r.download_url.as_deref().unwrap_or("")
525                );
526            }
527        }
528    }
529
530    Ok(())
531}
532
533#[allow(clippy::needless_pass_by_value)] // CLI entry point, values from clap
534fn run_show(
535    id: String,
536    catalog_path: Option<PathBuf>,
537    all_contigs: bool,
538    format: OutputFormat,
539) -> anyhow::Result<()> {
540    let catalog = if let Some(path) = catalog_path {
541        ReferenceCatalog::load_from_file(&path)?
542    } else {
543        ReferenceCatalog::load_embedded()?
544    };
545
546    let ref_id = crate::core::types::ReferenceId::new(&id);
547    let reference = catalog
548        .get(&ref_id)
549        .ok_or_else(|| anyhow::anyhow!("Reference '{id}' not found"))?;
550
551    match format {
552        OutputFormat::Text => {
553            println!("Reference: {}\n", reference.display_name);
554            println!("ID:       {}", reference.id);
555            println!("Assembly: {}", reference.assembly);
556            println!("Source:   {}", reference.source);
557            println!("Naming:   {:?}", reference.naming_convention);
558            println!("Contigs:  {}", reference.contigs.len());
559            println!("Has Decoy: {}", reference.has_decoy());
560            println!("Has ALT:   {}", reference.has_alt());
561
562            if let Some(desc) = &reference.description {
563                println!("\nDescription: {desc}");
564            }
565
566            if let Some(url) = &reference.download_url {
567                println!("\nDownload URL: {url}");
568            }
569
570            if !reference.tags.is_empty() {
571                println!("\nTags: {}", reference.tags.join(", "));
572            }
573
574            let contigs_to_show = if all_contigs {
575                &reference.contigs[..]
576            } else {
577                &reference.contigs[..reference.contigs.len().min(25)]
578            };
579
580            println!("\nContigs:");
581            println!("{:<25} {:>15} MD5", "Name", "Length");
582            println!("{}", "-".repeat(80));
583            for contig in contigs_to_show {
584                println!(
585                    "{:<25} {:>15} {}",
586                    contig.name,
587                    contig.length,
588                    contig.md5.as_deref().unwrap_or("-")
589                );
590            }
591
592            if !all_contigs && reference.contigs.len() > 25 {
593                println!(
594                    "\n... and {} more contigs (use --all-contigs to show all)",
595                    reference.contigs.len() - 25
596                );
597            }
598        }
599        OutputFormat::Json => {
600            println!("{}", serde_json::to_string_pretty(&reference)?);
601        }
602        OutputFormat::Tsv => {
603            println!("name\tlength\tmd5");
604            for contig in &reference.contigs {
605                println!(
606                    "{}\t{}\t{}",
607                    contig.name,
608                    contig.length,
609                    contig.md5.as_deref().unwrap_or("")
610                );
611            }
612        }
613    }
614
615    Ok(())
616}
617
618#[allow(clippy::needless_pass_by_value)] // CLI entry point, values from clap
619fn run_export(output: PathBuf, catalog_path: Option<PathBuf>) -> anyhow::Result<()> {
620    let catalog = if let Some(path) = catalog_path {
621        ReferenceCatalog::load_from_file(&path)?
622    } else {
623        ReferenceCatalog::load_embedded()?
624    };
625
626    let json = catalog.to_json()?;
627    std::fs::write(&output, json)?;
628
629    println!(
630        "Exported {} references to {}",
631        catalog.len(),
632        output.display()
633    );
634
635    Ok(())
636}
637
638#[allow(clippy::needless_pass_by_value, clippy::too_many_lines)] // CLI entry point; TODO: refactor
639fn run_list_hierarchical(
640    catalog_path: PathBuf,
641    format: OutputFormat,
642    verbose: bool,
643) -> anyhow::Result<()> {
644    let catalog = HierarchicalCatalog::load(&catalog_path)?;
645
646    if verbose {
647        eprintln!(
648            "Loaded hierarchical catalog v{} with {} assemblies",
649            catalog.version,
650            catalog.assemblies.len()
651        );
652    }
653
654    match format {
655        OutputFormat::Text => {
656            println!("Hierarchical Reference Catalog (v{})\n", catalog.version);
657
658            // Count totals
659            let mut total_versions = 0;
660            let mut total_distributions = 0;
661            let mut total_contigs = 0;
662
663            for assembly in &catalog.assemblies {
664                total_versions += assembly.versions.len();
665                for version in &assembly.versions {
666                    total_distributions += version.fasta_distributions.len();
667                    for dist in &version.fasta_distributions {
668                        total_contigs += dist.contigs.len();
669                    }
670                }
671            }
672
673            println!(
674                "Summary: {} assemblies, {} versions, {} distributions, {} total contigs\n",
675                catalog.assemblies.len(),
676                total_versions,
677                total_distributions,
678                total_contigs
679            );
680
681            // List hierarchy
682            for assembly in &catalog.assemblies {
683                println!("{} ({})", assembly.name, assembly.id);
684                println!("  Organism: {}", assembly.organism);
685
686                for version in &assembly.versions {
687                    println!("\n  Version: {} ({})", version.version, version.id);
688
689                    // Show report source
690                    match &version.source {
691                        crate::core::assembly::ReportSource::Ncbi { accession, .. } => {
692                            println!("    Source: NCBI ({accession})");
693                        }
694                        crate::core::assembly::ReportSource::DerivedFromFasta {
695                            base_assembly,
696                            ..
697                        } => {
698                            if let Some(base) = base_assembly {
699                                println!("    Source: Derived from FASTA (base: {base})");
700                            } else {
701                                println!("    Source: Derived from FASTA");
702                            }
703                        }
704                        crate::core::assembly::ReportSource::Manual { .. } => {
705                            println!("    Source: Manual");
706                        }
707                    }
708
709                    if !version.report_contigs.is_empty() {
710                        println!("    Report contigs: {}", version.report_contigs.len());
711                    }
712
713                    println!("    Distributions:");
714                    for dist in &version.fasta_distributions {
715                        let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
716                        println!(
717                            "      - {} ({}): {} contigs, {} with MD5",
718                            dist.display_name,
719                            dist.id,
720                            dist.contigs.len(),
721                            md5_count
722                        );
723
724                        if verbose {
725                            // Show presence breakdown
726                            let counts = dist.presence_counts();
727                            if counts.in_both > 0 || counts.fasta_only > 0 {
728                                println!(
729                                    "        Presence: {} in-both, {} fasta-only",
730                                    counts.in_both, counts.fasta_only
731                                );
732                            }
733
734                            if let Some(url) = &dist.download_url {
735                                println!("        URL: {url}");
736                            }
737                        }
738                    }
739                }
740                println!();
741            }
742
743            // Standalone distributions
744            if !catalog.standalone_distributions.is_empty() {
745                println!("Standalone Distributions:");
746                for dist in &catalog.standalone_distributions {
747                    let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
748                    println!(
749                        "  - {} ({}): {} contigs, {} with MD5",
750                        dist.display_name,
751                        dist.id,
752                        dist.contigs.len(),
753                        md5_count
754                    );
755                }
756            }
757        }
758        OutputFormat::Json => {
759            println!("{}", serde_json::to_string_pretty(&catalog)?);
760        }
761        OutputFormat::Tsv => {
762            println!(
763                "assembly_id\tversion_id\tdistribution_id\tdisplay_name\tcontig_count\tmd5_count"
764            );
765            for assembly in &catalog.assemblies {
766                for version in &assembly.versions {
767                    for dist in &version.fasta_distributions {
768                        let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
769                        println!(
770                            "{}\t{}\t{}\t{}\t{}\t{}",
771                            assembly.id,
772                            version.id,
773                            dist.id,
774                            dist.display_name,
775                            dist.contigs.len(),
776                            md5_count
777                        );
778                    }
779                }
780            }
781            // Standalone
782            for dist in &catalog.standalone_distributions {
783                let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
784                println!(
785                    "\t\t{}\t{}\t{}\t{}",
786                    dist.id,
787                    dist.display_name,
788                    dist.contigs.len(),
789                    md5_count
790                );
791            }
792        }
793    }
794
795    Ok(())
796}
797
798#[allow(
799    clippy::too_many_arguments,
800    clippy::needless_pass_by_value,
801    clippy::option_option,
802    clippy::too_many_lines,
803    clippy::fn_params_excessive_bools
804)] // CLI entry point
805fn run_build_hierarchical(
806    id: String,
807    name: String,
808    inputs: Vec<PathBuf>,
809    assembly_id: Option<String>,
810    version_id: Option<String>,
811    source: Option<String>,
812    download_url: Option<String>,
813    tags: Option<String>,
814    output: Option<PathBuf>,
815    append_to: Option<PathBuf>,
816    force: bool,
817    require_md5: bool,
818    infer_assembly: Option<Option<PathBuf>>,
819    generate_ucsc_names: bool,
820    format: OutputFormat,
821    verbose: bool,
822) -> anyhow::Result<()> {
823    use crate::catalog::builder::DistributionBuilder;
824
825    // Parse source
826    let ref_source = source.map_or(ReferenceSource::Custom("custom".to_string()), |s| {
827        parse_reference_source(&s)
828    });
829
830    // Parse tags
831    let tags: Vec<String> = tags
832        .map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
833        .unwrap_or_default();
834
835    // Create builder with UCSC name generation option
836    let mut builder = DistributionBuilder::new(&id)
837        .with_display_name(&name)
838        .with_source(ref_source)
839        .with_generate_ucsc_names(generate_ucsc_names);
840
841    if let Some(url) = download_url {
842        builder = builder.with_download_url(url);
843    }
844    if !tags.is_empty() {
845        builder = builder.with_tags(tags);
846    }
847
848    // Process input files
849    for input_path in &inputs {
850        if !input_path.exists() {
851            anyhow::bail!("Input file not found: {}", input_path.display());
852        }
853
854        if verbose {
855            eprintln!("Processing: {}", input_path.display());
856        }
857
858        builder.add_input(input_path)?;
859    }
860
861    // Build the distribution
862    let dist = builder.build()?;
863
864    // Check MD5 requirement
865    if require_md5 {
866        let missing_md5: Vec<_> = dist
867            .contigs
868            .iter()
869            .filter(|c| c.md5.is_empty())
870            .map(|c| c.name.as_str())
871            .collect();
872
873        if !missing_md5.is_empty() {
874            anyhow::bail!(
875                "MD5 required but {} contig(s) lack MD5: {}",
876                missing_md5.len(),
877                missing_md5.join(", ")
878            );
879        }
880    }
881
882    // Summary
883    let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
884    if verbose {
885        eprintln!(
886            "Built distribution '{}' with {} contigs ({} with MD5)",
887            id,
888            dist.contigs.len(),
889            md5_count
890        );
891    }
892
893    // Inference of base assembly
894    let (inferred_assembly_id, inferred_version_id) = if infer_assembly.is_some() {
895        // Load catalog for inference
896        let infer_catalog = match &infer_assembly {
897            Some(Some(path)) => Some(HierarchicalCatalog::load(path)?),
898            Some(None) => {
899                // Try to use append_to catalog first, otherwise try embedded (which doesn't exist in hierarchical format)
900                if let Some(ref append_path) = append_to {
901                    Some(HierarchicalCatalog::load(append_path)?)
902                } else {
903                    if verbose {
904                        eprintln!("Warning: No catalog specified for inference. Use --infer-assembly=<path> or --append-to");
905                    }
906                    None
907                }
908            }
909            None => None,
910        };
911
912        if let Some(ref catalog) = infer_catalog {
913            if let Some(inferred) = catalog.infer_base_assembly_default(&dist.contigs) {
914                if verbose {
915                    eprintln!(
916                        "Inferred base assembly: {} {} ({:.1}% match, {}/{} contigs)",
917                        inferred.assembly_name,
918                        inferred.version_string,
919                        inferred.match_rate * 100.0,
920                        inferred.matched_contigs,
921                        inferred.total_input_contigs
922                    );
923                }
924                (
925                    assembly_id.clone().or(Some(inferred.assembly_id)),
926                    version_id.clone().or(Some(inferred.version_id)),
927                )
928            } else {
929                if verbose {
930                    eprintln!("Could not infer base assembly (no match above 90% threshold)");
931                }
932                (assembly_id.clone(), version_id.clone())
933            }
934        } else {
935            (assembly_id.clone(), version_id.clone())
936        }
937    } else {
938        (assembly_id.clone(), version_id.clone())
939    };
940
941    // Output handling
942    if let Some(append_path) = append_to {
943        // Append to existing catalog
944        let mut catalog = HierarchicalCatalog::load(&append_path)?;
945
946        // Check if we need to add to a specific assembly/version (using inferred if available)
947        if let (Some(asm_id), Some(ver_id)) = (&inferred_assembly_id, &inferred_version_id) {
948            let mut found = false;
949            for assembly in &mut catalog.assemblies {
950                if assembly.id == *asm_id {
951                    for version in &mut assembly.versions {
952                        if version.id == *ver_id {
953                            // Check for existing distribution
954                            if !force && version.fasta_distributions.iter().any(|d| d.id == id) {
955                                anyhow::bail!(
956                                    "Distribution '{id}' already exists in version '{ver_id}'. Use --force to overwrite."
957                                );
958                            }
959
960                            // Remove existing if force
961                            version.fasta_distributions.retain(|d| d.id != id);
962                            version.fasta_distributions.push(dist.clone());
963                            found = true;
964                            break;
965                        }
966                    }
967                }
968            }
969            if !found {
970                anyhow::bail!("Assembly '{asm_id}' with version '{ver_id}' not found in catalog");
971            }
972        } else {
973            // Add as standalone distribution
974            if !force && catalog.standalone_distributions.iter().any(|d| d.id == id) {
975                anyhow::bail!(
976                    "Standalone distribution '{id}' already exists. Use --force to overwrite."
977                );
978            }
979            catalog.standalone_distributions.retain(|d| d.id != id);
980            catalog.standalone_distributions.push(dist.clone());
981        }
982
983        catalog.save(&append_path)?;
984        eprintln!("Added distribution '{}' to {}", id, append_path.display());
985    } else if let Some(out_path) = output {
986        // Create new output
987        if out_path.exists() && !force {
988            anyhow::bail!(
989                "Output file '{}' exists. Use --force to overwrite.",
990                out_path.display()
991            );
992        }
993
994        // Output as standalone distribution JSON or wrap in catalog
995        if let OutputFormat::Json = format {
996            // Just output the distribution as JSON
997            let json = serde_json::to_string_pretty(&dist)?;
998            std::fs::write(&out_path, json)?;
999            eprintln!("Wrote distribution to {}", out_path.display());
1000        } else {
1001            // Create a catalog with just this distribution
1002            let catalog = HierarchicalCatalog::new().with_standalone_distribution(dist);
1003            catalog.save(&out_path)?;
1004            eprintln!("Wrote hierarchical catalog to {}", out_path.display());
1005        }
1006    } else {
1007        // Print to stdout
1008        match format {
1009            OutputFormat::Json => {
1010                println!("{}", serde_json::to_string_pretty(&dist)?);
1011            }
1012            OutputFormat::Text => {
1013                print_distribution_summary(&dist);
1014            }
1015            OutputFormat::Tsv => {
1016                println!("name\tlength\tmd5\treport_contig_id");
1017                for c in &dist.contigs {
1018                    println!(
1019                        "{}\t{}\t{}\t{}",
1020                        c.name,
1021                        c.length,
1022                        c.md5,
1023                        c.report_contig_id
1024                            .map(|i| i.to_string())
1025                            .unwrap_or_default()
1026                    );
1027                }
1028            }
1029        }
1030    }
1031
1032    Ok(())
1033}
1034
1035fn print_distribution_summary(dist: &crate::core::assembly::FastaDistribution) {
1036    println!("Distribution: {} ({})", dist.display_name, dist.id);
1037    println!("Source: {:?}", dist.source);
1038    if let Some(url) = &dist.download_url {
1039        println!("Download URL: {url}");
1040    }
1041    if !dist.tags.is_empty() {
1042        println!("Tags: {}", dist.tags.join(", "));
1043    }
1044    println!("Contigs: {}", dist.contigs.len());
1045
1046    let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
1047    println!("With MD5: {md5_count}");
1048
1049    let linked = dist
1050        .contigs
1051        .iter()
1052        .filter(|c| c.report_contig_id.is_some())
1053        .count();
1054    println!("Linked to report: {linked}");
1055
1056    // Show presence counts
1057    let counts = dist.presence_counts();
1058    if counts.in_both > 0 || counts.fasta_only > 0 {
1059        println!(
1060            "Presence: {} in-both, {} fasta-only",
1061            counts.in_both, counts.fasta_only
1062        );
1063    }
1064}
1065
1066fn parse_reference_source(s: &str) -> ReferenceSource {
1067    match s.to_lowercase().as_str() {
1068        "ucsc" => ReferenceSource::Ucsc,
1069        "ncbi" => ReferenceSource::Ncbi,
1070        "broad" => ReferenceSource::Broad,
1071        "ensembl" => ReferenceSource::Ensembl,
1072        "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
1073        "dragen" | "illumina" => ReferenceSource::Illumina,
1074        _ => ReferenceSource::Custom(s.to_string()),
1075    }
1076}
1077
/// Shortens `s` so that it occupies at most `max_len` bytes.
///
/// Strings that already fit are returned unchanged. Longer strings are cut and
/// suffixed with `"..."`, with the ellipsis counted against `max_len`.
///
/// The cut is moved back to the nearest UTF-8 character boundary, so the
/// function never panics on multi-byte text (the result may then be slightly
/// shorter than `max_len`). When `max_len` is smaller than the ellipsis itself
/// (fewer than 3 bytes), the ellipsis is omitted and only the leading part of
/// `s` that fits is returned.
fn truncate(s: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    if s.len() <= max_len {
        return s.to_string();
    }

    // Reserve room for the ellipsis only when it can actually fit; this also
    // avoids the `max_len - 3` underflow for very small limits.
    let (budget, suffix) = if max_len >= ELLIPSIS.len() {
        (max_len - ELLIPSIS.len(), ELLIPSIS)
    } else {
        (max_len, "")
    };

    // `budget < s.len()` holds here, so it is a valid index; walk back until it
    // no longer splits a multi-byte character (index 0 is always a boundary).
    let mut cut = budget;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }

    format!("{}{}", &s[..cut], suffix)
}
1085
1086#[allow(
1087    clippy::too_many_arguments,
1088    clippy::needless_pass_by_value,
1089    clippy::too_many_lines,
1090    clippy::fn_params_excessive_bools
1091)] // CLI entry point
1092fn run_build(
1093    id: String,
1094    name: String,
1095    inputs: Vec<PathBuf>,
1096    assembly: Option<String>,
1097    source: Option<String>,
1098    description: Option<String>,
1099    download_url: Option<String>,
1100    assembly_report_url: Option<String>,
1101    tags: Option<String>,
1102    output: Option<PathBuf>,
1103    append_to: Option<PathBuf>,
1104    force: bool,
1105    input_format: Option<InputFormatArg>,
1106    require_md5: bool,
1107    generate_ucsc_names: bool,
1108    format: OutputFormat,
1109    verbose: bool,
1110) -> anyhow::Result<()> {
1111    // Parse assembly
1112    let assembly = assembly.map(|s| parse_assembly(&s));
1113
1114    // Parse source
1115    let source = source.map(|s| parse_source(&s));
1116
1117    // Parse tags
1118    let tags: Vec<String> = tags
1119        .map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
1120        .unwrap_or_default();
1121
1122    // Create builder with UCSC name generation option
1123    let mut builder = ReferenceBuilder::new(&id, &name).generate_ucsc_names(generate_ucsc_names);
1124
1125    if let Some(assembly) = assembly {
1126        builder = builder.assembly(assembly);
1127    }
1128    if let Some(source) = source {
1129        builder = builder.source(source);
1130    }
1131    if let Some(desc) = description {
1132        builder = builder.description(desc);
1133    }
1134    if let Some(url) = download_url {
1135        builder = builder.download_url(url);
1136    }
1137    if let Some(url) = assembly_report_url {
1138        builder = builder.assembly_report_url(url);
1139    }
1140    if !tags.is_empty() {
1141        builder = builder.tags(tags);
1142    }
1143
1144    // Process input files
1145    for input_path in &inputs {
1146        if !input_path.exists() {
1147            anyhow::bail!("Input file not found: {}", input_path.display());
1148        }
1149
1150        if verbose {
1151            eprintln!("Processing: {}", input_path.display());
1152        }
1153
1154        if let Some(fmt) = input_format {
1155            builder.add_input_with_format(input_path, fmt.into())?;
1156        } else {
1157            builder.add_input(input_path)?;
1158        }
1159    }
1160
1161    // Get summary before building
1162    let summary = builder.summary();
1163
1164    // Check for conflicts
1165    if !summary.conflicts.is_empty() {
1166        eprintln!("Build failed due to conflicts:");
1167        for conflict in &summary.conflicts {
1168            eprintln!("  - {conflict}");
1169        }
1170        anyhow::bail!(
1171            "Build failed: {} conflict(s) detected",
1172            summary.conflicts.len()
1173        );
1174    }
1175
1176    // Check MD5 requirement
1177    if require_md5 && summary.with_md5 < summary.total_contigs {
1178        anyhow::bail!(
1179            "Build failed: --require-md5 specified but only {}/{} contigs have MD5",
1180            summary.with_md5,
1181            summary.total_contigs
1182        );
1183    }
1184
1185    // Build the reference
1186    let reference = builder.build()?;
1187
1188    // Print summary
1189    if verbose || matches!(format, OutputFormat::Text) {
1190        eprintln!("{summary}");
1191    }
1192
1193    // Handle output
1194    if let Some(catalog_path) = append_to {
1195        // Append to existing catalog
1196        let mut catalog = if catalog_path.exists() {
1197            ReferenceCatalog::load_from_file(&catalog_path)?
1198        } else {
1199            ReferenceCatalog::new()
1200        };
1201
1202        // Check if ID already exists
1203        let ref_id = crate::core::types::ReferenceId::new(&id);
1204        if catalog.get(&ref_id).is_some() {
1205            if force {
1206                eprintln!("Warning: Overwriting existing reference '{id}'");
1207                // Remove old reference by rebuilding catalog without it
1208                let refs: Vec<_> = catalog
1209                    .references
1210                    .into_iter()
1211                    .filter(|r| r.id != ref_id)
1212                    .collect();
1213                catalog = ReferenceCatalog::new();
1214                for r in refs {
1215                    catalog.add_reference(r);
1216                }
1217            } else {
1218                anyhow::bail!(
1219                    "Reference '{id}' already exists in catalog. Use --force to overwrite."
1220                );
1221            }
1222        }
1223
1224        catalog.add_reference(reference);
1225        let json = catalog.to_json()?;
1226        std::fs::write(&catalog_path, json)?;
1227
1228        println!(
1229            "Added reference '{}' to {} ({} total references)",
1230            id,
1231            catalog_path.display(),
1232            catalog.len()
1233        );
1234    } else if let Some(output_path) = output {
1235        // Write single reference to file
1236        let json = serde_json::to_string_pretty(&reference)?;
1237        std::fs::write(&output_path, &json)?;
1238        println!("Wrote reference '{}' to {}", id, output_path.display());
1239    } else {
1240        // Print to stdout
1241        match format {
1242            OutputFormat::Json => {
1243                println!("{}", serde_json::to_string_pretty(&reference)?);
1244            }
1245            OutputFormat::Text | OutputFormat::Tsv => {
1246                // Print summary info
1247                println!("Reference: {}", reference.display_name);
1248                println!("ID:        {}", reference.id);
1249                println!("Assembly:  {}", reference.assembly);
1250                println!("Source:    {}", reference.source);
1251                println!("Contigs:   {}", reference.contigs.len());
1252                println!();
1253                println!("Use --output <file> to save as JSON");
1254            }
1255        }
1256    }
1257
1258    Ok(())
1259}
1260
1261fn parse_assembly(s: &str) -> Assembly {
1262    let lower = s.to_lowercase();
1263    match lower.as_str() {
1264        "grch37" | "hg19" | "b37" => Assembly::Grch37,
1265        "grch38" | "hg38" => Assembly::Grch38,
1266        _ => Assembly::Other(s.to_string()),
1267    }
1268}
1269
1270fn parse_source(s: &str) -> ReferenceSource {
1271    let lower = s.to_lowercase();
1272    match lower.as_str() {
1273        "ucsc" => ReferenceSource::Ucsc,
1274        "ncbi" | "grc" => ReferenceSource::Ncbi,
1275        "broad" => ReferenceSource::Broad,
1276        "ensembl" => ReferenceSource::Ensembl,
1277        "illumina" | "dragen" => ReferenceSource::Illumina,
1278        "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
1279        _ => ReferenceSource::Custom(s.to_string()),
1280    }
1281}