1use std::path::PathBuf;
2
3use clap::{Args, Subcommand};
4
5use crate::catalog::builder::{InputFormat, ReferenceBuilder};
6use crate::catalog::hierarchical::HierarchicalCatalog;
7use crate::catalog::store::ReferenceCatalog;
8use crate::cli::OutputFormat;
9use crate::core::types::{Assembly, ReferenceSource};
10
/// Widens a `usize` count to `f64` so it can take part in ratio arithmetic.
///
/// The conversion is a plain `as` cast; values above 2^53 cannot be
/// represented exactly, which is why the precision-loss lint is silenced.
#[inline]
#[allow(clippy::cast_precision_loss)]
fn count_to_f64(count: usize) -> f64 {
    count as f64
}
19
// Arguments of the catalog command group. The struct only wraps the nested
// subcommand; `run` matches on it to pick the handler.
// (Plain `//` comments are used on purpose: clap can turn `///` text into
// help output, and this note is meant for maintainers only.)
#[derive(Args)]
pub struct CatalogArgs {
    // The catalog operation requested on the command line.
    #[command(subcommand)]
    pub command: CatalogCommands,
}
25
26#[derive(Subcommand)]
27#[non_exhaustive]
28pub enum CatalogCommands {
29 List {
31 #[arg(long)]
33 catalog: Option<PathBuf>,
34
35 #[arg(long)]
37 assembly: Option<String>,
38
39 #[arg(long)]
41 source: Option<String>,
42 },
43
44 Show {
46 #[arg(required = true)]
48 id: String,
49
50 #[arg(long)]
52 catalog: Option<PathBuf>,
53
54 #[arg(long)]
56 all_contigs: bool,
57 },
58
59 Export {
61 #[arg(required = true)]
63 output: PathBuf,
64
65 #[arg(long)]
67 catalog: Option<PathBuf>,
68 },
69
70 ListHierarchical {
72 #[arg(required = true)]
74 catalog: PathBuf,
75 },
76
77 BuildHierarchical {
79 #[arg(long, required = true)]
81 id: String,
82
83 #[arg(long, required = true)]
85 name: String,
86
87 #[arg(short, long = "input", required = true, num_args = 1..)]
89 inputs: Vec<PathBuf>,
90
91 #[arg(long)]
93 assembly_id: Option<String>,
94
95 #[arg(long)]
97 version_id: Option<String>,
98
99 #[arg(long)]
101 source: Option<String>,
102
103 #[arg(long)]
105 download_url: Option<String>,
106
107 #[arg(long)]
109 tags: Option<String>,
110
111 #[arg(short, long)]
113 output: Option<PathBuf>,
114
115 #[arg(long)]
117 append_to: Option<PathBuf>,
118
119 #[arg(long)]
121 force: bool,
122
123 #[arg(long)]
125 require_md5: bool,
126
127 #[arg(long)]
130 #[allow(clippy::option_option)]
131 infer_assembly: Option<Option<PathBuf>>,
133
134 #[arg(long)]
145 no_generate_ucsc_names: bool,
146 },
147
148 Build {
150 #[arg(long, required = true)]
152 id: String,
153
154 #[arg(long, required = true)]
156 name: String,
157
158 #[arg(short, long = "input", required = true, num_args = 1..)]
161 inputs: Vec<PathBuf>,
162
163 #[arg(long)]
165 assembly: Option<String>,
166
167 #[arg(long)]
169 source: Option<String>,
170
171 #[arg(long)]
173 description: Option<String>,
174
175 #[arg(long)]
177 download_url: Option<String>,
178
179 #[arg(long)]
181 assembly_report_url: Option<String>,
182
183 #[arg(long)]
185 tags: Option<String>,
186
187 #[arg(long)]
189 species: Option<String>,
190
191 #[arg(short, long)]
193 output: Option<PathBuf>,
194
195 #[arg(long)]
197 append_to: Option<PathBuf>,
198
199 #[arg(long)]
201 force: bool,
202
203 #[arg(long, value_enum)]
205 input_format: Option<InputFormatArg>,
206
207 #[arg(long)]
209 require_md5: bool,
210
211 #[arg(long)]
222 no_generate_ucsc_names: bool,
223 },
224}
225
// Command-line selection of an input format (used by the `--input-format`
// option of the `Build` subcommand). Every variant converts to the
// `InputFormat` variant of the same name through the `From` impl that follows.
// Plain `//` comments are used so that clap's generated help stays unchanged.
#[derive(Clone, Copy, Debug, clap::ValueEnum)]
pub enum InputFormatArg {
    Dict,
    Fai,
    Fasta,
    NcbiReport,
    Sam,
    Bam,
    Cram,
    Vcf,
    Tsv,
}
239
240impl From<InputFormatArg> for InputFormat {
241 fn from(arg: InputFormatArg) -> Self {
242 match arg {
243 InputFormatArg::Dict => InputFormat::Dict,
244 InputFormatArg::Fai => InputFormat::Fai,
245 InputFormatArg::Fasta => InputFormat::Fasta,
246 InputFormatArg::NcbiReport => InputFormat::NcbiReport,
247 InputFormatArg::Sam => InputFormat::Sam,
248 InputFormatArg::Bam => InputFormat::Bam,
249 InputFormatArg::Cram => InputFormat::Cram,
250 InputFormatArg::Vcf => InputFormat::Vcf,
251 InputFormatArg::Tsv => InputFormat::Tsv,
252 }
253 }
254}
255
256pub fn run(args: CatalogArgs, format: OutputFormat, verbose: bool) -> anyhow::Result<()> {
262 match args.command {
263 CatalogCommands::List {
264 catalog,
265 assembly,
266 source,
267 } => run_list(
268 catalog,
269 assembly.as_deref(),
270 source.as_deref(),
271 format,
272 verbose,
273 ),
274 CatalogCommands::Show {
275 id,
276 catalog,
277 all_contigs,
278 } => run_show(id, catalog, all_contigs, format),
279 CatalogCommands::Export { output, catalog } => run_export(output, catalog),
280 CatalogCommands::ListHierarchical { catalog } => {
281 run_list_hierarchical(catalog, format, verbose)
282 }
283 CatalogCommands::BuildHierarchical {
284 id,
285 name,
286 inputs,
287 assembly_id,
288 version_id,
289 source,
290 download_url,
291 tags,
292 output,
293 append_to,
294 force,
295 require_md5,
296 infer_assembly,
297 no_generate_ucsc_names,
298 } => run_build_hierarchical(
299 id,
300 name,
301 inputs,
302 assembly_id,
303 version_id,
304 source,
305 download_url,
306 tags,
307 output,
308 append_to,
309 force,
310 require_md5,
311 infer_assembly,
312 !no_generate_ucsc_names, format,
314 verbose,
315 ),
316 CatalogCommands::Build {
317 id,
318 name,
319 inputs,
320 assembly,
321 source,
322 description,
323 download_url,
324 assembly_report_url,
325 tags,
326 species,
327 output,
328 append_to,
329 force,
330 input_format,
331 require_md5,
332 no_generate_ucsc_names,
333 } => run_build(
334 id,
335 name,
336 inputs,
337 assembly,
338 source,
339 description,
340 download_url,
341 assembly_report_url,
342 tags,
343 species,
344 output,
345 append_to,
346 force,
347 input_format,
348 require_md5,
349 !no_generate_ucsc_names, format,
351 verbose,
352 ),
353 }
354}
355
356#[allow(clippy::too_many_lines)] fn run_list(
358 catalog_path: Option<PathBuf>,
359 assembly_filter: Option<&str>,
360 source_filter: Option<&str>,
361 format: OutputFormat,
362 verbose: bool,
363) -> anyhow::Result<()> {
364 let catalog = if let Some(path) = catalog_path {
365 ReferenceCatalog::load_from_file(&path)?
366 } else {
367 ReferenceCatalog::load_embedded()?
368 };
369
370 if verbose {
371 eprintln!("Loaded catalog with {} references", catalog.len());
372 }
373
374 let filtered: Vec<_> = catalog
376 .references
377 .iter()
378 .filter(|r| {
379 if let Some(assembly) = &assembly_filter {
380 let ref_assembly = format!("{}", r.assembly).to_lowercase();
381 if !ref_assembly.contains(&assembly.to_lowercase()) {
382 return false;
383 }
384 }
385 if let Some(source) = &source_filter {
386 let ref_source = format!("{}", r.source).to_lowercase();
387 if !ref_source.contains(&source.to_lowercase()) {
388 return false;
389 }
390 }
391 true
392 })
393 .collect();
394
395 match format {
396 OutputFormat::Text => {
397 let id_width = filtered
399 .iter()
400 .map(|r| r.id.0.len())
401 .max()
402 .unwrap_or(2)
403 .max(2);
404 let name_width = filtered
405 .iter()
406 .map(|r| r.display_name.len().min(35))
407 .max()
408 .unwrap_or(4)
409 .max(4);
410 let assembly_width = filtered
411 .iter()
412 .map(|r| format!("{}", r.assembly).len())
413 .max()
414 .unwrap_or(8)
415 .max(8);
416 let source_width = filtered
417 .iter()
418 .map(|r| format!("{}", r.source).len())
419 .max()
420 .unwrap_or(6)
421 .max(6);
422
423 let total_width = id_width + name_width + assembly_width + source_width + 8 + 8;
424
425 println!("Reference Catalog ({} references)\n", filtered.len());
426 println!(
427 "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
428 "ID",
429 "Name",
430 "Assembly",
431 "Source",
432 "Contigs",
433 id_w = id_width,
434 name_w = name_width,
435 asm_w = assembly_width,
436 src_w = source_width
437 );
438 println!("{}", "-".repeat(total_width));
439
440 for r in &filtered {
441 println!(
442 "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
443 r.id.0,
444 truncate(&r.display_name, name_width),
445 format!("{}", r.assembly),
446 format!("{}", r.source),
447 r.contigs.len(),
448 id_w = id_width,
449 name_w = name_width,
450 asm_w = assembly_width,
451 src_w = source_width
452 );
453 if verbose {
454 let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
455 let md5_pct = if r.contigs.is_empty() {
456 0.0
457 } else {
458 100.0 * count_to_f64(md5_count) / count_to_f64(r.contigs.len())
459 };
460 if let Some(url) = &r.download_url {
461 println!(
462 " └─ MD5: {}/{} ({:.0}%) URL: {}",
463 md5_count,
464 r.contigs.len(),
465 md5_pct,
466 url
467 );
468 } else {
469 println!(
470 " └─ MD5: {}/{} ({:.0}%)",
471 md5_count,
472 r.contigs.len(),
473 md5_pct
474 );
475 }
476 }
477 }
478 }
479 OutputFormat::Json => {
480 let output: Vec<serde_json::Value> = filtered
481 .iter()
482 .map(|r| {
483 let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
484 let role_counts = r.role_counts();
485 let mut json = serde_json::json!({
486 "id": r.id.0,
487 "display_name": r.display_name,
488 "assembly": format!("{}", r.assembly),
489 "source": format!("{}", r.source),
490 "contig_count": r.contigs.len(),
491 "md5_count": md5_count,
492 "has_decoy": r.has_decoy(),
493 "has_alt": r.has_alt(),
494 "fasta_url": r.download_url,
495 "assembly_report_url": r.assembly_report_url,
496 "role_counts": {
497 "assembled_molecule": role_counts.assembled_molecule,
498 "alt_scaffold": role_counts.alt_scaffold,
499 "fix_patch": role_counts.fix_patch,
500 "novel_patch": role_counts.novel_patch,
501 "unlocalized_scaffold": role_counts.unlocalized_scaffold,
502 "unplaced_scaffold": role_counts.unplaced_scaffold,
503 "unknown": role_counts.unknown,
504 },
505 "tags": r.tags,
506 });
507 if !r.contigs_missing_from_fasta.is_empty() {
509 json["contigs_missing_from_fasta"] =
510 serde_json::json!(&r.contigs_missing_from_fasta);
511 }
512 json
513 })
514 .collect();
515 println!("{}", serde_json::to_string_pretty(&output)?);
516 }
517 OutputFormat::Tsv => {
518 println!("id\tdisplay_name\tassembly\tsource\tcontig_count\tmd5_count\thas_decoy\thas_alt\tdownload_url");
519 for r in &filtered {
520 let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
521 println!(
522 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
523 r.id.0,
524 r.display_name,
525 r.assembly,
526 r.source,
527 r.contigs.len(),
528 md5_count,
529 r.has_decoy(),
530 r.has_alt(),
531 r.download_url.as_deref().unwrap_or("")
532 );
533 }
534 }
535 }
536
537 Ok(())
538}
539
540#[allow(clippy::needless_pass_by_value)] fn run_show(
542 id: String,
543 catalog_path: Option<PathBuf>,
544 all_contigs: bool,
545 format: OutputFormat,
546) -> anyhow::Result<()> {
547 let catalog = if let Some(path) = catalog_path {
548 ReferenceCatalog::load_from_file(&path)?
549 } else {
550 ReferenceCatalog::load_embedded()?
551 };
552
553 let ref_id = crate::core::types::ReferenceId::new(&id);
554 let reference = catalog
555 .get(&ref_id)
556 .ok_or_else(|| anyhow::anyhow!("Reference '{id}' not found"))?;
557
558 match format {
559 OutputFormat::Text => {
560 println!("Reference: {}\n", reference.display_name);
561 println!("ID: {}", reference.id);
562 println!("Assembly: {}", reference.assembly);
563 println!("Source: {}", reference.source);
564 println!("Naming: {:?}", reference.naming_convention);
565 println!("Contigs: {}", reference.contigs.len());
566 println!("Has Decoy: {}", reference.has_decoy());
567 println!("Has ALT: {}", reference.has_alt());
568
569 if let Some(desc) = &reference.description {
570 println!("\nDescription: {desc}");
571 }
572
573 if let Some(url) = &reference.download_url {
574 println!("\nDownload URL: {url}");
575 }
576
577 if !reference.tags.is_empty() {
578 println!("\nTags: {}", reference.tags.join(", "));
579 }
580
581 let contigs_to_show = if all_contigs {
582 &reference.contigs[..]
583 } else {
584 &reference.contigs[..reference.contigs.len().min(25)]
585 };
586
587 println!("\nContigs:");
588 println!("{:<25} {:>15} MD5", "Name", "Length");
589 println!("{}", "-".repeat(80));
590 for contig in contigs_to_show {
591 println!(
592 "{:<25} {:>15} {}",
593 contig.name,
594 contig.length,
595 contig.md5.as_deref().unwrap_or("-")
596 );
597 }
598
599 if !all_contigs && reference.contigs.len() > 25 {
600 println!(
601 "\n... and {} more contigs (use --all-contigs to show all)",
602 reference.contigs.len() - 25
603 );
604 }
605 }
606 OutputFormat::Json => {
607 println!("{}", serde_json::to_string_pretty(&reference)?);
608 }
609 OutputFormat::Tsv => {
610 println!("name\tlength\tmd5");
611 for contig in &reference.contigs {
612 println!(
613 "{}\t{}\t{}",
614 contig.name,
615 contig.length,
616 contig.md5.as_deref().unwrap_or("")
617 );
618 }
619 }
620 }
621
622 Ok(())
623}
624
625#[allow(clippy::needless_pass_by_value)] fn run_export(output: PathBuf, catalog_path: Option<PathBuf>) -> anyhow::Result<()> {
627 let catalog = if let Some(path) = catalog_path {
628 ReferenceCatalog::load_from_file(&path)?
629 } else {
630 ReferenceCatalog::load_embedded()?
631 };
632
633 let json = catalog.to_json()?;
634 std::fs::write(&output, json)?;
635
636 println!(
637 "Exported {} references to {}",
638 catalog.len(),
639 output.display()
640 );
641
642 Ok(())
643}
644
645#[allow(clippy::needless_pass_by_value, clippy::too_many_lines)] fn run_list_hierarchical(
647 catalog_path: PathBuf,
648 format: OutputFormat,
649 verbose: bool,
650) -> anyhow::Result<()> {
651 let catalog = HierarchicalCatalog::load(&catalog_path)?;
652
653 if verbose {
654 eprintln!(
655 "Loaded hierarchical catalog v{} with {} assemblies",
656 catalog.version,
657 catalog.assemblies.len()
658 );
659 }
660
661 match format {
662 OutputFormat::Text => {
663 println!("Hierarchical Reference Catalog (v{})\n", catalog.version);
664
665 let mut total_versions = 0;
667 let mut total_distributions = 0;
668 let mut total_contigs = 0;
669
670 for assembly in &catalog.assemblies {
671 total_versions += assembly.versions.len();
672 for version in &assembly.versions {
673 total_distributions += version.fasta_distributions.len();
674 for dist in &version.fasta_distributions {
675 total_contigs += dist.contigs.len();
676 }
677 }
678 }
679
680 println!(
681 "Summary: {} assemblies, {} versions, {} distributions, {} total contigs\n",
682 catalog.assemblies.len(),
683 total_versions,
684 total_distributions,
685 total_contigs
686 );
687
688 for assembly in &catalog.assemblies {
690 println!("{} ({})", assembly.name, assembly.id);
691 println!(" Organism: {}", assembly.organism);
692
693 for version in &assembly.versions {
694 println!("\n Version: {} ({})", version.version, version.id);
695
696 match &version.source {
698 crate::core::assembly::ReportSource::Ncbi { accession, .. } => {
699 println!(" Source: NCBI ({accession})");
700 }
701 crate::core::assembly::ReportSource::DerivedFromFasta {
702 base_assembly,
703 ..
704 } => {
705 if let Some(base) = base_assembly {
706 println!(" Source: Derived from FASTA (base: {base})");
707 } else {
708 println!(" Source: Derived from FASTA");
709 }
710 }
711 crate::core::assembly::ReportSource::Manual { .. } => {
712 println!(" Source: Manual");
713 }
714 }
715
716 if !version.report_contigs.is_empty() {
717 println!(" Report contigs: {}", version.report_contigs.len());
718 }
719
720 println!(" Distributions:");
721 for dist in &version.fasta_distributions {
722 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
723 println!(
724 " - {} ({}): {} contigs, {} with MD5",
725 dist.display_name,
726 dist.id,
727 dist.contigs.len(),
728 md5_count
729 );
730
731 if verbose {
732 let counts = dist.presence_counts();
734 if counts.in_both > 0 || counts.fasta_only > 0 {
735 println!(
736 " Presence: {} in-both, {} fasta-only",
737 counts.in_both, counts.fasta_only
738 );
739 }
740
741 if let Some(url) = &dist.download_url {
742 println!(" URL: {url}");
743 }
744 }
745 }
746 }
747 println!();
748 }
749
750 if !catalog.standalone_distributions.is_empty() {
752 println!("Standalone Distributions:");
753 for dist in &catalog.standalone_distributions {
754 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
755 println!(
756 " - {} ({}): {} contigs, {} with MD5",
757 dist.display_name,
758 dist.id,
759 dist.contigs.len(),
760 md5_count
761 );
762 }
763 }
764 }
765 OutputFormat::Json => {
766 println!("{}", serde_json::to_string_pretty(&catalog)?);
767 }
768 OutputFormat::Tsv => {
769 println!(
770 "assembly_id\tversion_id\tdistribution_id\tdisplay_name\tcontig_count\tmd5_count"
771 );
772 for assembly in &catalog.assemblies {
773 for version in &assembly.versions {
774 for dist in &version.fasta_distributions {
775 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
776 println!(
777 "{}\t{}\t{}\t{}\t{}\t{}",
778 assembly.id,
779 version.id,
780 dist.id,
781 dist.display_name,
782 dist.contigs.len(),
783 md5_count
784 );
785 }
786 }
787 }
788 for dist in &catalog.standalone_distributions {
790 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
791 println!(
792 "\t\t{}\t{}\t{}\t{}",
793 dist.id,
794 dist.display_name,
795 dist.contigs.len(),
796 md5_count
797 );
798 }
799 }
800 }
801
802 Ok(())
803}
804
805#[allow(
806 clippy::too_many_arguments,
807 clippy::needless_pass_by_value,
808 clippy::option_option,
809 clippy::too_many_lines,
810 clippy::fn_params_excessive_bools
811)] fn run_build_hierarchical(
813 id: String,
814 name: String,
815 inputs: Vec<PathBuf>,
816 assembly_id: Option<String>,
817 version_id: Option<String>,
818 source: Option<String>,
819 download_url: Option<String>,
820 tags: Option<String>,
821 output: Option<PathBuf>,
822 append_to: Option<PathBuf>,
823 force: bool,
824 require_md5: bool,
825 infer_assembly: Option<Option<PathBuf>>,
826 generate_ucsc_names: bool,
827 format: OutputFormat,
828 verbose: bool,
829) -> anyhow::Result<()> {
830 use crate::catalog::builder::DistributionBuilder;
831
832 let ref_source = source.map_or(ReferenceSource::Custom("custom".to_string()), |s| {
834 parse_reference_source(&s)
835 });
836
837 let tags: Vec<String> = tags
839 .map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
840 .unwrap_or_default();
841
842 let mut builder = DistributionBuilder::new(&id)
844 .with_display_name(&name)
845 .with_source(ref_source)
846 .with_generate_ucsc_names(generate_ucsc_names);
847
848 if let Some(url) = download_url {
849 builder = builder.with_download_url(url);
850 }
851 if !tags.is_empty() {
852 builder = builder.with_tags(tags);
853 }
854
855 for input_path in &inputs {
857 if !input_path.exists() {
858 anyhow::bail!("Input file not found: {}", input_path.display());
859 }
860
861 if verbose {
862 eprintln!("Processing: {}", input_path.display());
863 }
864
865 builder.add_input(input_path)?;
866 }
867
868 let dist = builder.build()?;
870
871 if require_md5 {
873 let missing_md5: Vec<_> = dist
874 .contigs
875 .iter()
876 .filter(|c| c.md5.is_empty())
877 .map(|c| c.name.as_str())
878 .collect();
879
880 if !missing_md5.is_empty() {
881 anyhow::bail!(
882 "MD5 required but {} contig(s) lack MD5: {}",
883 missing_md5.len(),
884 missing_md5.join(", ")
885 );
886 }
887 }
888
889 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
891 if verbose {
892 eprintln!(
893 "Built distribution '{}' with {} contigs ({} with MD5)",
894 id,
895 dist.contigs.len(),
896 md5_count
897 );
898 }
899
900 let (inferred_assembly_id, inferred_version_id) = if infer_assembly.is_some() {
902 let infer_catalog = match &infer_assembly {
904 Some(Some(path)) => Some(HierarchicalCatalog::load(path)?),
905 Some(None) => {
906 if let Some(ref append_path) = append_to {
908 Some(HierarchicalCatalog::load(append_path)?)
909 } else {
910 if verbose {
911 eprintln!("Warning: No catalog specified for inference. Use --infer-assembly=<path> or --append-to");
912 }
913 None
914 }
915 }
916 None => None,
917 };
918
919 if let Some(ref catalog) = infer_catalog {
920 if let Some(inferred) = catalog.infer_base_assembly_default(&dist.contigs) {
921 if verbose {
922 eprintln!(
923 "Inferred base assembly: {} {} ({:.1}% match, {}/{} contigs)",
924 inferred.assembly_name,
925 inferred.version_string,
926 inferred.match_rate * 100.0,
927 inferred.matched_contigs,
928 inferred.total_input_contigs
929 );
930 }
931 (
932 assembly_id.clone().or(Some(inferred.assembly_id)),
933 version_id.clone().or(Some(inferred.version_id)),
934 )
935 } else {
936 if verbose {
937 eprintln!("Could not infer base assembly (no match above 90% threshold)");
938 }
939 (assembly_id.clone(), version_id.clone())
940 }
941 } else {
942 (assembly_id.clone(), version_id.clone())
943 }
944 } else {
945 (assembly_id.clone(), version_id.clone())
946 };
947
948 if let Some(append_path) = append_to {
950 let mut catalog = HierarchicalCatalog::load(&append_path)?;
952
953 if let (Some(asm_id), Some(ver_id)) = (&inferred_assembly_id, &inferred_version_id) {
955 let mut found = false;
956 for assembly in &mut catalog.assemblies {
957 if assembly.id == *asm_id {
958 for version in &mut assembly.versions {
959 if version.id == *ver_id {
960 if !force && version.fasta_distributions.iter().any(|d| d.id == id) {
962 anyhow::bail!(
963 "Distribution '{id}' already exists in version '{ver_id}'. Use --force to overwrite."
964 );
965 }
966
967 version.fasta_distributions.retain(|d| d.id != id);
969 version.fasta_distributions.push(dist.clone());
970 found = true;
971 break;
972 }
973 }
974 }
975 }
976 if !found {
977 anyhow::bail!("Assembly '{asm_id}' with version '{ver_id}' not found in catalog");
978 }
979 } else {
980 if !force && catalog.standalone_distributions.iter().any(|d| d.id == id) {
982 anyhow::bail!(
983 "Standalone distribution '{id}' already exists. Use --force to overwrite."
984 );
985 }
986 catalog.standalone_distributions.retain(|d| d.id != id);
987 catalog.standalone_distributions.push(dist.clone());
988 }
989
990 catalog.save(&append_path)?;
991 eprintln!("Added distribution '{}' to {}", id, append_path.display());
992 } else if let Some(out_path) = output {
993 if out_path.exists() && !force {
995 anyhow::bail!(
996 "Output file '{}' exists. Use --force to overwrite.",
997 out_path.display()
998 );
999 }
1000
1001 if let OutputFormat::Json = format {
1003 let json = serde_json::to_string_pretty(&dist)?;
1005 std::fs::write(&out_path, json)?;
1006 eprintln!("Wrote distribution to {}", out_path.display());
1007 } else {
1008 let catalog = HierarchicalCatalog::new().with_standalone_distribution(dist);
1010 catalog.save(&out_path)?;
1011 eprintln!("Wrote hierarchical catalog to {}", out_path.display());
1012 }
1013 } else {
1014 match format {
1016 OutputFormat::Json => {
1017 println!("{}", serde_json::to_string_pretty(&dist)?);
1018 }
1019 OutputFormat::Text => {
1020 print_distribution_summary(&dist);
1021 }
1022 OutputFormat::Tsv => {
1023 println!("name\tlength\tmd5\treport_contig_id");
1024 for c in &dist.contigs {
1025 println!(
1026 "{}\t{}\t{}\t{}",
1027 c.name,
1028 c.length,
1029 c.md5,
1030 c.report_contig_id
1031 .map(|i| i.to_string())
1032 .unwrap_or_default()
1033 );
1034 }
1035 }
1036 }
1037 }
1038
1039 Ok(())
1040}
1041
1042fn print_distribution_summary(dist: &crate::core::assembly::FastaDistribution) {
1043 println!("Distribution: {} ({})", dist.display_name, dist.id);
1044 println!("Source: {:?}", dist.source);
1045 if let Some(url) = &dist.download_url {
1046 println!("Download URL: {url}");
1047 }
1048 if !dist.tags.is_empty() {
1049 println!("Tags: {}", dist.tags.join(", "));
1050 }
1051 println!("Contigs: {}", dist.contigs.len());
1052
1053 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
1054 println!("With MD5: {md5_count}");
1055
1056 let linked = dist
1057 .contigs
1058 .iter()
1059 .filter(|c| c.report_contig_id.is_some())
1060 .count();
1061 println!("Linked to report: {linked}");
1062
1063 let counts = dist.presence_counts();
1065 if counts.in_both > 0 || counts.fasta_only > 0 {
1066 println!(
1067 "Presence: {} in-both, {} fasta-only",
1068 counts.in_both, counts.fasta_only
1069 );
1070 }
1071}
1072
1073fn parse_reference_source(s: &str) -> ReferenceSource {
1074 match s.to_lowercase().as_str() {
1075 "ucsc" => ReferenceSource::Ucsc,
1076 "ncbi" => ReferenceSource::Ncbi,
1077 "broad" => ReferenceSource::Broad,
1078 "ensembl" => ReferenceSource::Ensembl,
1079 "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
1080 "dragen" | "illumina" => ReferenceSource::Illumina,
1081 _ => ReferenceSource::Custom(s.to_string()),
1082 }
1083}
1084
/// Shortens `s` to at most `max_len` characters for column display.
///
/// Text that already fits is returned unchanged. Longer text is cut and
/// finished with `...`, the ellipsis counting towards `max_len`. When
/// `max_len` is too small to hold the ellipsis (fewer than 3), the text is
/// simply cut to `max_len` characters.
///
/// Lengths are measured in `char`s, not bytes, so multi-byte UTF-8 text is
/// never split inside a character (byte slicing would panic there).
fn truncate(s: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    if s.chars().count() <= max_len {
        return s.to_string();
    }

    // No room for the ellipsis: hard cut instead of underflowing `max_len - 3`.
    if max_len < ELLIPSIS.len() {
        return s.chars().take(max_len).collect();
    }

    let mut shortened: String = s.chars().take(max_len - ELLIPSIS.len()).collect();
    shortened.push_str(ELLIPSIS);
    shortened
}
1092
1093#[allow(
1094 clippy::too_many_arguments,
1095 clippy::needless_pass_by_value,
1096 clippy::too_many_lines,
1097 clippy::fn_params_excessive_bools
1098)] fn run_build(
1100 id: String,
1101 name: String,
1102 inputs: Vec<PathBuf>,
1103 assembly: Option<String>,
1104 source: Option<String>,
1105 description: Option<String>,
1106 download_url: Option<String>,
1107 assembly_report_url: Option<String>,
1108 tags: Option<String>,
1109 species: Option<String>,
1110 output: Option<PathBuf>,
1111 append_to: Option<PathBuf>,
1112 force: bool,
1113 input_format: Option<InputFormatArg>,
1114 require_md5: bool,
1115 generate_ucsc_names: bool,
1116 format: OutputFormat,
1117 verbose: bool,
1118) -> anyhow::Result<()> {
1119 let assembly = assembly.map(|s| parse_assembly(&s));
1121
1122 let source = source.map(|s| parse_source(&s));
1124
1125 let tags: Vec<String> = tags
1127 .map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
1128 .unwrap_or_default();
1129
1130 let mut builder = ReferenceBuilder::new(&id, &name).generate_ucsc_names(generate_ucsc_names);
1132
1133 if let Some(assembly) = assembly {
1134 builder = builder.assembly(assembly);
1135 }
1136 if let Some(source) = source {
1137 builder = builder.source(source);
1138 }
1139 if let Some(desc) = description {
1140 builder = builder.description(desc);
1141 }
1142 if let Some(url) = download_url {
1143 builder = builder.download_url(url);
1144 }
1145 if let Some(url) = assembly_report_url {
1146 builder = builder.assembly_report_url(url);
1147 }
1148 if !tags.is_empty() {
1149 builder = builder.tags(tags);
1150 }
1151 if let Some(sp) = species {
1152 builder = builder.species(sp);
1153 }
1154
1155 for input_path in &inputs {
1157 if !input_path.exists() {
1158 anyhow::bail!("Input file not found: {}", input_path.display());
1159 }
1160
1161 if verbose {
1162 eprintln!("Processing: {}", input_path.display());
1163 }
1164
1165 if let Some(fmt) = input_format {
1166 builder.add_input_with_format(input_path, fmt.into())?;
1167 } else {
1168 builder.add_input(input_path)?;
1169 }
1170 }
1171
1172 let summary = builder.summary();
1174
1175 if !summary.conflicts.is_empty() {
1177 eprintln!("Build failed due to conflicts:");
1178 for conflict in &summary.conflicts {
1179 eprintln!(" - {conflict}");
1180 }
1181 anyhow::bail!(
1182 "Build failed: {} conflict(s) detected",
1183 summary.conflicts.len()
1184 );
1185 }
1186
1187 if require_md5 && summary.with_md5 < summary.total_contigs {
1189 anyhow::bail!(
1190 "Build failed: --require-md5 specified but only {}/{} contigs have MD5",
1191 summary.with_md5,
1192 summary.total_contigs
1193 );
1194 }
1195
1196 let reference = builder.build()?;
1198
1199 if verbose || matches!(format, OutputFormat::Text) {
1201 eprintln!("{summary}");
1202 }
1203
1204 if let Some(catalog_path) = append_to {
1206 let mut catalog = if catalog_path.exists() {
1208 ReferenceCatalog::load_from_file(&catalog_path)?
1209 } else {
1210 ReferenceCatalog::new()
1211 };
1212
1213 let ref_id = crate::core::types::ReferenceId::new(&id);
1215 if catalog.get(&ref_id).is_some() {
1216 if force {
1217 eprintln!("Warning: Overwriting existing reference '{id}'");
1218 let refs: Vec<_> = catalog
1220 .references
1221 .into_iter()
1222 .filter(|r| r.id != ref_id)
1223 .collect();
1224 catalog = ReferenceCatalog::new();
1225 for r in refs {
1226 catalog.add_reference(r);
1227 }
1228 } else {
1229 anyhow::bail!(
1230 "Reference '{id}' already exists in catalog. Use --force to overwrite."
1231 );
1232 }
1233 }
1234
1235 catalog.add_reference(reference);
1236 let json = catalog.to_json()?;
1237 std::fs::write(&catalog_path, json)?;
1238
1239 println!(
1240 "Added reference '{}' to {} ({} total references)",
1241 id,
1242 catalog_path.display(),
1243 catalog.len()
1244 );
1245 } else if let Some(output_path) = output {
1246 let json = serde_json::to_string_pretty(&reference)?;
1248 std::fs::write(&output_path, &json)?;
1249 println!("Wrote reference '{}' to {}", id, output_path.display());
1250 } else {
1251 match format {
1253 OutputFormat::Json => {
1254 println!("{}", serde_json::to_string_pretty(&reference)?);
1255 }
1256 OutputFormat::Text | OutputFormat::Tsv => {
1257 println!("Reference: {}", reference.display_name);
1259 println!("ID: {}", reference.id);
1260 println!("Assembly: {}", reference.assembly);
1261 println!("Source: {}", reference.source);
1262 println!("Contigs: {}", reference.contigs.len());
1263 println!();
1264 println!("Use --output <file> to save as JSON");
1265 }
1266 }
1267 }
1268
1269 Ok(())
1270}
1271
1272fn parse_assembly(s: &str) -> Assembly {
1273 let lower = s.to_lowercase();
1274 match lower.as_str() {
1275 "grch37" | "hg19" | "b37" => Assembly::Grch37,
1276 "grch38" | "hg38" => Assembly::Grch38,
1277 _ => Assembly::Other(s.to_string()),
1278 }
1279}
1280
1281fn parse_source(s: &str) -> ReferenceSource {
1282 let lower = s.to_lowercase();
1283 match lower.as_str() {
1284 "ucsc" => ReferenceSource::Ucsc,
1285 "ncbi" | "grc" => ReferenceSource::Ncbi,
1286 "broad" => ReferenceSource::Broad,
1287 "ensembl" => ReferenceSource::Ensembl,
1288 "illumina" | "dragen" => ReferenceSource::Illumina,
1289 "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
1290 _ => ReferenceSource::Custom(s.to_string()),
1291 }
1292}