1use std::path::PathBuf;
2
3use clap::{Args, Subcommand};
4
5use crate::catalog::builder::{InputFormat, ReferenceBuilder};
6use crate::catalog::hierarchical::HierarchicalCatalog;
7use crate::catalog::store::ReferenceCatalog;
8use crate::cli::OutputFormat;
9use crate::core::types::{Assembly, ReferenceSource};
10
/// Converts a `usize` count into an `f64` so it can be used in ratio/percentage arithmetic.
///
/// The cast can lose precision for very large counts; the corresponding clippy lint is
/// silenced deliberately for this helper.
#[inline]
#[allow(clippy::cast_precision_loss)]
fn count_to_f64(count: usize) -> f64 {
    count as f64
}
19
// Argument wrapper for the catalog commands: it carries nothing but the chosen subcommand,
// which `run` dispatches on.
// NOTE: plain `//` comments are used on purpose; clap's derive macros can turn `///` doc
// comments into help text, which would change the CLI output.
#[derive(Args)]
pub struct CatalogArgs {
    // The catalog subcommand selected by the user.
    #[command(subcommand)]
    pub command: CatalogCommands,
}
25
26#[derive(Subcommand)]
27pub enum CatalogCommands {
28 List {
30 #[arg(long)]
32 catalog: Option<PathBuf>,
33
34 #[arg(long)]
36 assembly: Option<String>,
37
38 #[arg(long)]
40 source: Option<String>,
41 },
42
43 Show {
45 #[arg(required = true)]
47 id: String,
48
49 #[arg(long)]
51 catalog: Option<PathBuf>,
52
53 #[arg(long)]
55 all_contigs: bool,
56 },
57
58 Export {
60 #[arg(required = true)]
62 output: PathBuf,
63
64 #[arg(long)]
66 catalog: Option<PathBuf>,
67 },
68
69 ListHierarchical {
71 #[arg(required = true)]
73 catalog: PathBuf,
74 },
75
76 BuildHierarchical {
78 #[arg(long, required = true)]
80 id: String,
81
82 #[arg(long, required = true)]
84 name: String,
85
86 #[arg(short, long = "input", required = true, num_args = 1..)]
88 inputs: Vec<PathBuf>,
89
90 #[arg(long)]
92 assembly_id: Option<String>,
93
94 #[arg(long)]
96 version_id: Option<String>,
97
98 #[arg(long)]
100 source: Option<String>,
101
102 #[arg(long)]
104 download_url: Option<String>,
105
106 #[arg(long)]
108 tags: Option<String>,
109
110 #[arg(short, long)]
112 output: Option<PathBuf>,
113
114 #[arg(long)]
116 append_to: Option<PathBuf>,
117
118 #[arg(long)]
120 force: bool,
121
122 #[arg(long)]
124 require_md5: bool,
125
126 #[arg(long)]
129 #[allow(clippy::option_option)]
130 infer_assembly: Option<Option<PathBuf>>,
132
133 #[arg(long)]
144 no_generate_ucsc_names: bool,
145 },
146
147 Build {
149 #[arg(long, required = true)]
151 id: String,
152
153 #[arg(long, required = true)]
155 name: String,
156
157 #[arg(short, long = "input", required = true, num_args = 1..)]
160 inputs: Vec<PathBuf>,
161
162 #[arg(long)]
164 assembly: Option<String>,
165
166 #[arg(long)]
168 source: Option<String>,
169
170 #[arg(long)]
172 description: Option<String>,
173
174 #[arg(long)]
176 download_url: Option<String>,
177
178 #[arg(long)]
180 assembly_report_url: Option<String>,
181
182 #[arg(long)]
184 tags: Option<String>,
185
186 #[arg(short, long)]
188 output: Option<PathBuf>,
189
190 #[arg(long)]
192 append_to: Option<PathBuf>,
193
194 #[arg(long)]
196 force: bool,
197
198 #[arg(long, value_enum)]
200 input_format: Option<InputFormatArg>,
201
202 #[arg(long)]
204 require_md5: bool,
205
206 #[arg(long)]
217 no_generate_ucsc_names: bool,
218 },
219}
220
// Input formats that can be chosen on the command line (the `input_format` argument of the
// `Build` subcommand).
//
// Every variant converts to the `InputFormat` variant of the same name through the
// `From<InputFormatArg> for InputFormat` impl in this file, so the two enums must be kept
// in step when a format is added.
// NOTE: plain `//` comments are used on purpose; clap's derive macros can turn `///` doc
// comments into help text, which would change the CLI output.
#[derive(Clone, Copy, Debug, clap::ValueEnum)]
pub enum InputFormatArg {
    Dict,
    Fai,
    Fasta,
    NcbiReport,
    Sam,
    Bam,
    Cram,
    Vcf,
    Tsv,
}
234
235impl From<InputFormatArg> for InputFormat {
236 fn from(arg: InputFormatArg) -> Self {
237 match arg {
238 InputFormatArg::Dict => InputFormat::Dict,
239 InputFormatArg::Fai => InputFormat::Fai,
240 InputFormatArg::Fasta => InputFormat::Fasta,
241 InputFormatArg::NcbiReport => InputFormat::NcbiReport,
242 InputFormatArg::Sam => InputFormat::Sam,
243 InputFormatArg::Bam => InputFormat::Bam,
244 InputFormatArg::Cram => InputFormat::Cram,
245 InputFormatArg::Vcf => InputFormat::Vcf,
246 InputFormatArg::Tsv => InputFormat::Tsv,
247 }
248 }
249}
250
251pub fn run(args: CatalogArgs, format: OutputFormat, verbose: bool) -> anyhow::Result<()> {
257 match args.command {
258 CatalogCommands::List {
259 catalog,
260 assembly,
261 source,
262 } => run_list(
263 catalog,
264 assembly.as_deref(),
265 source.as_deref(),
266 format,
267 verbose,
268 ),
269 CatalogCommands::Show {
270 id,
271 catalog,
272 all_contigs,
273 } => run_show(id, catalog, all_contigs, format),
274 CatalogCommands::Export { output, catalog } => run_export(output, catalog),
275 CatalogCommands::ListHierarchical { catalog } => {
276 run_list_hierarchical(catalog, format, verbose)
277 }
278 CatalogCommands::BuildHierarchical {
279 id,
280 name,
281 inputs,
282 assembly_id,
283 version_id,
284 source,
285 download_url,
286 tags,
287 output,
288 append_to,
289 force,
290 require_md5,
291 infer_assembly,
292 no_generate_ucsc_names,
293 } => run_build_hierarchical(
294 id,
295 name,
296 inputs,
297 assembly_id,
298 version_id,
299 source,
300 download_url,
301 tags,
302 output,
303 append_to,
304 force,
305 require_md5,
306 infer_assembly,
307 !no_generate_ucsc_names, format,
309 verbose,
310 ),
311 CatalogCommands::Build {
312 id,
313 name,
314 inputs,
315 assembly,
316 source,
317 description,
318 download_url,
319 assembly_report_url,
320 tags,
321 output,
322 append_to,
323 force,
324 input_format,
325 require_md5,
326 no_generate_ucsc_names,
327 } => run_build(
328 id,
329 name,
330 inputs,
331 assembly,
332 source,
333 description,
334 download_url,
335 assembly_report_url,
336 tags,
337 output,
338 append_to,
339 force,
340 input_format,
341 require_md5,
342 !no_generate_ucsc_names, format,
344 verbose,
345 ),
346 }
347}
348
349#[allow(clippy::too_many_lines)] fn run_list(
351 catalog_path: Option<PathBuf>,
352 assembly_filter: Option<&str>,
353 source_filter: Option<&str>,
354 format: OutputFormat,
355 verbose: bool,
356) -> anyhow::Result<()> {
357 let catalog = if let Some(path) = catalog_path {
358 ReferenceCatalog::load_from_file(&path)?
359 } else {
360 ReferenceCatalog::load_embedded()?
361 };
362
363 if verbose {
364 eprintln!("Loaded catalog with {} references", catalog.len());
365 }
366
367 let filtered: Vec<_> = catalog
369 .references
370 .iter()
371 .filter(|r| {
372 if let Some(assembly) = &assembly_filter {
373 let ref_assembly = format!("{}", r.assembly).to_lowercase();
374 if !ref_assembly.contains(&assembly.to_lowercase()) {
375 return false;
376 }
377 }
378 if let Some(source) = &source_filter {
379 let ref_source = format!("{}", r.source).to_lowercase();
380 if !ref_source.contains(&source.to_lowercase()) {
381 return false;
382 }
383 }
384 true
385 })
386 .collect();
387
388 match format {
389 OutputFormat::Text => {
390 let id_width = filtered
392 .iter()
393 .map(|r| r.id.0.len())
394 .max()
395 .unwrap_or(2)
396 .max(2);
397 let name_width = filtered
398 .iter()
399 .map(|r| r.display_name.len().min(35))
400 .max()
401 .unwrap_or(4)
402 .max(4);
403 let assembly_width = filtered
404 .iter()
405 .map(|r| format!("{}", r.assembly).len())
406 .max()
407 .unwrap_or(8)
408 .max(8);
409 let source_width = filtered
410 .iter()
411 .map(|r| format!("{}", r.source).len())
412 .max()
413 .unwrap_or(6)
414 .max(6);
415
416 let total_width = id_width + name_width + assembly_width + source_width + 8 + 8;
417
418 println!("Reference Catalog ({} references)\n", filtered.len());
419 println!(
420 "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
421 "ID",
422 "Name",
423 "Assembly",
424 "Source",
425 "Contigs",
426 id_w = id_width,
427 name_w = name_width,
428 asm_w = assembly_width,
429 src_w = source_width
430 );
431 println!("{}", "-".repeat(total_width));
432
433 for r in &filtered {
434 println!(
435 "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
436 r.id.0,
437 truncate(&r.display_name, name_width),
438 format!("{}", r.assembly),
439 format!("{}", r.source),
440 r.contigs.len(),
441 id_w = id_width,
442 name_w = name_width,
443 asm_w = assembly_width,
444 src_w = source_width
445 );
446 if verbose {
447 let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
448 let md5_pct = if r.contigs.is_empty() {
449 0.0
450 } else {
451 100.0 * count_to_f64(md5_count) / count_to_f64(r.contigs.len())
452 };
453 if let Some(url) = &r.download_url {
454 println!(
455 " └─ MD5: {}/{} ({:.0}%) URL: {}",
456 md5_count,
457 r.contigs.len(),
458 md5_pct,
459 url
460 );
461 } else {
462 println!(
463 " └─ MD5: {}/{} ({:.0}%)",
464 md5_count,
465 r.contigs.len(),
466 md5_pct
467 );
468 }
469 }
470 }
471 }
472 OutputFormat::Json => {
473 let output: Vec<serde_json::Value> = filtered
474 .iter()
475 .map(|r| {
476 let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
477 let role_counts = r.role_counts();
478 let mut json = serde_json::json!({
479 "id": r.id.0,
480 "display_name": r.display_name,
481 "assembly": format!("{}", r.assembly),
482 "source": format!("{}", r.source),
483 "contig_count": r.contigs.len(),
484 "md5_count": md5_count,
485 "has_decoy": r.has_decoy(),
486 "has_alt": r.has_alt(),
487 "fasta_url": r.download_url,
488 "assembly_report_url": r.assembly_report_url,
489 "role_counts": {
490 "assembled_molecule": role_counts.assembled_molecule,
491 "alt_scaffold": role_counts.alt_scaffold,
492 "fix_patch": role_counts.fix_patch,
493 "novel_patch": role_counts.novel_patch,
494 "unlocalized_scaffold": role_counts.unlocalized_scaffold,
495 "unplaced_scaffold": role_counts.unplaced_scaffold,
496 "unknown": role_counts.unknown,
497 },
498 "tags": r.tags,
499 });
500 if !r.contigs_missing_from_fasta.is_empty() {
502 json["contigs_missing_from_fasta"] =
503 serde_json::json!(&r.contigs_missing_from_fasta);
504 }
505 json
506 })
507 .collect();
508 println!("{}", serde_json::to_string_pretty(&output)?);
509 }
510 OutputFormat::Tsv => {
511 println!("id\tdisplay_name\tassembly\tsource\tcontig_count\tmd5_count\thas_decoy\thas_alt\tdownload_url");
512 for r in &filtered {
513 let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
514 println!(
515 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
516 r.id.0,
517 r.display_name,
518 r.assembly,
519 r.source,
520 r.contigs.len(),
521 md5_count,
522 r.has_decoy(),
523 r.has_alt(),
524 r.download_url.as_deref().unwrap_or("")
525 );
526 }
527 }
528 }
529
530 Ok(())
531}
532
533#[allow(clippy::needless_pass_by_value)] fn run_show(
535 id: String,
536 catalog_path: Option<PathBuf>,
537 all_contigs: bool,
538 format: OutputFormat,
539) -> anyhow::Result<()> {
540 let catalog = if let Some(path) = catalog_path {
541 ReferenceCatalog::load_from_file(&path)?
542 } else {
543 ReferenceCatalog::load_embedded()?
544 };
545
546 let ref_id = crate::core::types::ReferenceId::new(&id);
547 let reference = catalog
548 .get(&ref_id)
549 .ok_or_else(|| anyhow::anyhow!("Reference '{id}' not found"))?;
550
551 match format {
552 OutputFormat::Text => {
553 println!("Reference: {}\n", reference.display_name);
554 println!("ID: {}", reference.id);
555 println!("Assembly: {}", reference.assembly);
556 println!("Source: {}", reference.source);
557 println!("Naming: {:?}", reference.naming_convention);
558 println!("Contigs: {}", reference.contigs.len());
559 println!("Has Decoy: {}", reference.has_decoy());
560 println!("Has ALT: {}", reference.has_alt());
561
562 if let Some(desc) = &reference.description {
563 println!("\nDescription: {desc}");
564 }
565
566 if let Some(url) = &reference.download_url {
567 println!("\nDownload URL: {url}");
568 }
569
570 if !reference.tags.is_empty() {
571 println!("\nTags: {}", reference.tags.join(", "));
572 }
573
574 let contigs_to_show = if all_contigs {
575 &reference.contigs[..]
576 } else {
577 &reference.contigs[..reference.contigs.len().min(25)]
578 };
579
580 println!("\nContigs:");
581 println!("{:<25} {:>15} MD5", "Name", "Length");
582 println!("{}", "-".repeat(80));
583 for contig in contigs_to_show {
584 println!(
585 "{:<25} {:>15} {}",
586 contig.name,
587 contig.length,
588 contig.md5.as_deref().unwrap_or("-")
589 );
590 }
591
592 if !all_contigs && reference.contigs.len() > 25 {
593 println!(
594 "\n... and {} more contigs (use --all-contigs to show all)",
595 reference.contigs.len() - 25
596 );
597 }
598 }
599 OutputFormat::Json => {
600 println!("{}", serde_json::to_string_pretty(&reference)?);
601 }
602 OutputFormat::Tsv => {
603 println!("name\tlength\tmd5");
604 for contig in &reference.contigs {
605 println!(
606 "{}\t{}\t{}",
607 contig.name,
608 contig.length,
609 contig.md5.as_deref().unwrap_or("")
610 );
611 }
612 }
613 }
614
615 Ok(())
616}
617
618#[allow(clippy::needless_pass_by_value)] fn run_export(output: PathBuf, catalog_path: Option<PathBuf>) -> anyhow::Result<()> {
620 let catalog = if let Some(path) = catalog_path {
621 ReferenceCatalog::load_from_file(&path)?
622 } else {
623 ReferenceCatalog::load_embedded()?
624 };
625
626 let json = catalog.to_json()?;
627 std::fs::write(&output, json)?;
628
629 println!(
630 "Exported {} references to {}",
631 catalog.len(),
632 output.display()
633 );
634
635 Ok(())
636}
637
638#[allow(clippy::needless_pass_by_value, clippy::too_many_lines)] fn run_list_hierarchical(
640 catalog_path: PathBuf,
641 format: OutputFormat,
642 verbose: bool,
643) -> anyhow::Result<()> {
644 let catalog = HierarchicalCatalog::load(&catalog_path)?;
645
646 if verbose {
647 eprintln!(
648 "Loaded hierarchical catalog v{} with {} assemblies",
649 catalog.version,
650 catalog.assemblies.len()
651 );
652 }
653
654 match format {
655 OutputFormat::Text => {
656 println!("Hierarchical Reference Catalog (v{})\n", catalog.version);
657
658 let mut total_versions = 0;
660 let mut total_distributions = 0;
661 let mut total_contigs = 0;
662
663 for assembly in &catalog.assemblies {
664 total_versions += assembly.versions.len();
665 for version in &assembly.versions {
666 total_distributions += version.fasta_distributions.len();
667 for dist in &version.fasta_distributions {
668 total_contigs += dist.contigs.len();
669 }
670 }
671 }
672
673 println!(
674 "Summary: {} assemblies, {} versions, {} distributions, {} total contigs\n",
675 catalog.assemblies.len(),
676 total_versions,
677 total_distributions,
678 total_contigs
679 );
680
681 for assembly in &catalog.assemblies {
683 println!("{} ({})", assembly.name, assembly.id);
684 println!(" Organism: {}", assembly.organism);
685
686 for version in &assembly.versions {
687 println!("\n Version: {} ({})", version.version, version.id);
688
689 match &version.source {
691 crate::core::assembly::ReportSource::Ncbi { accession, .. } => {
692 println!(" Source: NCBI ({accession})");
693 }
694 crate::core::assembly::ReportSource::DerivedFromFasta {
695 base_assembly,
696 ..
697 } => {
698 if let Some(base) = base_assembly {
699 println!(" Source: Derived from FASTA (base: {base})");
700 } else {
701 println!(" Source: Derived from FASTA");
702 }
703 }
704 crate::core::assembly::ReportSource::Manual { .. } => {
705 println!(" Source: Manual");
706 }
707 }
708
709 if !version.report_contigs.is_empty() {
710 println!(" Report contigs: {}", version.report_contigs.len());
711 }
712
713 println!(" Distributions:");
714 for dist in &version.fasta_distributions {
715 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
716 println!(
717 " - {} ({}): {} contigs, {} with MD5",
718 dist.display_name,
719 dist.id,
720 dist.contigs.len(),
721 md5_count
722 );
723
724 if verbose {
725 let counts = dist.presence_counts();
727 if counts.in_both > 0 || counts.fasta_only > 0 {
728 println!(
729 " Presence: {} in-both, {} fasta-only",
730 counts.in_both, counts.fasta_only
731 );
732 }
733
734 if let Some(url) = &dist.download_url {
735 println!(" URL: {url}");
736 }
737 }
738 }
739 }
740 println!();
741 }
742
743 if !catalog.standalone_distributions.is_empty() {
745 println!("Standalone Distributions:");
746 for dist in &catalog.standalone_distributions {
747 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
748 println!(
749 " - {} ({}): {} contigs, {} with MD5",
750 dist.display_name,
751 dist.id,
752 dist.contigs.len(),
753 md5_count
754 );
755 }
756 }
757 }
758 OutputFormat::Json => {
759 println!("{}", serde_json::to_string_pretty(&catalog)?);
760 }
761 OutputFormat::Tsv => {
762 println!(
763 "assembly_id\tversion_id\tdistribution_id\tdisplay_name\tcontig_count\tmd5_count"
764 );
765 for assembly in &catalog.assemblies {
766 for version in &assembly.versions {
767 for dist in &version.fasta_distributions {
768 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
769 println!(
770 "{}\t{}\t{}\t{}\t{}\t{}",
771 assembly.id,
772 version.id,
773 dist.id,
774 dist.display_name,
775 dist.contigs.len(),
776 md5_count
777 );
778 }
779 }
780 }
781 for dist in &catalog.standalone_distributions {
783 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
784 println!(
785 "\t\t{}\t{}\t{}\t{}",
786 dist.id,
787 dist.display_name,
788 dist.contigs.len(),
789 md5_count
790 );
791 }
792 }
793 }
794
795 Ok(())
796}
797
798#[allow(
799 clippy::too_many_arguments,
800 clippy::needless_pass_by_value,
801 clippy::option_option,
802 clippy::too_many_lines,
803 clippy::fn_params_excessive_bools
804)] fn run_build_hierarchical(
806 id: String,
807 name: String,
808 inputs: Vec<PathBuf>,
809 assembly_id: Option<String>,
810 version_id: Option<String>,
811 source: Option<String>,
812 download_url: Option<String>,
813 tags: Option<String>,
814 output: Option<PathBuf>,
815 append_to: Option<PathBuf>,
816 force: bool,
817 require_md5: bool,
818 infer_assembly: Option<Option<PathBuf>>,
819 generate_ucsc_names: bool,
820 format: OutputFormat,
821 verbose: bool,
822) -> anyhow::Result<()> {
823 use crate::catalog::builder::DistributionBuilder;
824
825 let ref_source = source.map_or(ReferenceSource::Custom("custom".to_string()), |s| {
827 parse_reference_source(&s)
828 });
829
830 let tags: Vec<String> = tags
832 .map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
833 .unwrap_or_default();
834
835 let mut builder = DistributionBuilder::new(&id)
837 .with_display_name(&name)
838 .with_source(ref_source)
839 .with_generate_ucsc_names(generate_ucsc_names);
840
841 if let Some(url) = download_url {
842 builder = builder.with_download_url(url);
843 }
844 if !tags.is_empty() {
845 builder = builder.with_tags(tags);
846 }
847
848 for input_path in &inputs {
850 if !input_path.exists() {
851 anyhow::bail!("Input file not found: {}", input_path.display());
852 }
853
854 if verbose {
855 eprintln!("Processing: {}", input_path.display());
856 }
857
858 builder.add_input(input_path)?;
859 }
860
861 let dist = builder.build()?;
863
864 if require_md5 {
866 let missing_md5: Vec<_> = dist
867 .contigs
868 .iter()
869 .filter(|c| c.md5.is_empty())
870 .map(|c| c.name.as_str())
871 .collect();
872
873 if !missing_md5.is_empty() {
874 anyhow::bail!(
875 "MD5 required but {} contig(s) lack MD5: {}",
876 missing_md5.len(),
877 missing_md5.join(", ")
878 );
879 }
880 }
881
882 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
884 if verbose {
885 eprintln!(
886 "Built distribution '{}' with {} contigs ({} with MD5)",
887 id,
888 dist.contigs.len(),
889 md5_count
890 );
891 }
892
893 let (inferred_assembly_id, inferred_version_id) = if infer_assembly.is_some() {
895 let infer_catalog = match &infer_assembly {
897 Some(Some(path)) => Some(HierarchicalCatalog::load(path)?),
898 Some(None) => {
899 if let Some(ref append_path) = append_to {
901 Some(HierarchicalCatalog::load(append_path)?)
902 } else {
903 if verbose {
904 eprintln!("Warning: No catalog specified for inference. Use --infer-assembly=<path> or --append-to");
905 }
906 None
907 }
908 }
909 None => None,
910 };
911
912 if let Some(ref catalog) = infer_catalog {
913 if let Some(inferred) = catalog.infer_base_assembly_default(&dist.contigs) {
914 if verbose {
915 eprintln!(
916 "Inferred base assembly: {} {} ({:.1}% match, {}/{} contigs)",
917 inferred.assembly_name,
918 inferred.version_string,
919 inferred.match_rate * 100.0,
920 inferred.matched_contigs,
921 inferred.total_input_contigs
922 );
923 }
924 (
925 assembly_id.clone().or(Some(inferred.assembly_id)),
926 version_id.clone().or(Some(inferred.version_id)),
927 )
928 } else {
929 if verbose {
930 eprintln!("Could not infer base assembly (no match above 90% threshold)");
931 }
932 (assembly_id.clone(), version_id.clone())
933 }
934 } else {
935 (assembly_id.clone(), version_id.clone())
936 }
937 } else {
938 (assembly_id.clone(), version_id.clone())
939 };
940
941 if let Some(append_path) = append_to {
943 let mut catalog = HierarchicalCatalog::load(&append_path)?;
945
946 if let (Some(asm_id), Some(ver_id)) = (&inferred_assembly_id, &inferred_version_id) {
948 let mut found = false;
949 for assembly in &mut catalog.assemblies {
950 if assembly.id == *asm_id {
951 for version in &mut assembly.versions {
952 if version.id == *ver_id {
953 if !force && version.fasta_distributions.iter().any(|d| d.id == id) {
955 anyhow::bail!(
956 "Distribution '{id}' already exists in version '{ver_id}'. Use --force to overwrite."
957 );
958 }
959
960 version.fasta_distributions.retain(|d| d.id != id);
962 version.fasta_distributions.push(dist.clone());
963 found = true;
964 break;
965 }
966 }
967 }
968 }
969 if !found {
970 anyhow::bail!("Assembly '{asm_id}' with version '{ver_id}' not found in catalog");
971 }
972 } else {
973 if !force && catalog.standalone_distributions.iter().any(|d| d.id == id) {
975 anyhow::bail!(
976 "Standalone distribution '{id}' already exists. Use --force to overwrite."
977 );
978 }
979 catalog.standalone_distributions.retain(|d| d.id != id);
980 catalog.standalone_distributions.push(dist.clone());
981 }
982
983 catalog.save(&append_path)?;
984 eprintln!("Added distribution '{}' to {}", id, append_path.display());
985 } else if let Some(out_path) = output {
986 if out_path.exists() && !force {
988 anyhow::bail!(
989 "Output file '{}' exists. Use --force to overwrite.",
990 out_path.display()
991 );
992 }
993
994 if let OutputFormat::Json = format {
996 let json = serde_json::to_string_pretty(&dist)?;
998 std::fs::write(&out_path, json)?;
999 eprintln!("Wrote distribution to {}", out_path.display());
1000 } else {
1001 let catalog = HierarchicalCatalog::new().with_standalone_distribution(dist);
1003 catalog.save(&out_path)?;
1004 eprintln!("Wrote hierarchical catalog to {}", out_path.display());
1005 }
1006 } else {
1007 match format {
1009 OutputFormat::Json => {
1010 println!("{}", serde_json::to_string_pretty(&dist)?);
1011 }
1012 OutputFormat::Text => {
1013 print_distribution_summary(&dist);
1014 }
1015 OutputFormat::Tsv => {
1016 println!("name\tlength\tmd5\treport_contig_id");
1017 for c in &dist.contigs {
1018 println!(
1019 "{}\t{}\t{}\t{}",
1020 c.name,
1021 c.length,
1022 c.md5,
1023 c.report_contig_id
1024 .map(|i| i.to_string())
1025 .unwrap_or_default()
1026 );
1027 }
1028 }
1029 }
1030 }
1031
1032 Ok(())
1033}
1034
1035fn print_distribution_summary(dist: &crate::core::assembly::FastaDistribution) {
1036 println!("Distribution: {} ({})", dist.display_name, dist.id);
1037 println!("Source: {:?}", dist.source);
1038 if let Some(url) = &dist.download_url {
1039 println!("Download URL: {url}");
1040 }
1041 if !dist.tags.is_empty() {
1042 println!("Tags: {}", dist.tags.join(", "));
1043 }
1044 println!("Contigs: {}", dist.contigs.len());
1045
1046 let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
1047 println!("With MD5: {md5_count}");
1048
1049 let linked = dist
1050 .contigs
1051 .iter()
1052 .filter(|c| c.report_contig_id.is_some())
1053 .count();
1054 println!("Linked to report: {linked}");
1055
1056 let counts = dist.presence_counts();
1058 if counts.in_both > 0 || counts.fasta_only > 0 {
1059 println!(
1060 "Presence: {} in-both, {} fasta-only",
1061 counts.in_both, counts.fasta_only
1062 );
1063 }
1064}
1065
1066fn parse_reference_source(s: &str) -> ReferenceSource {
1067 match s.to_lowercase().as_str() {
1068 "ucsc" => ReferenceSource::Ucsc,
1069 "ncbi" => ReferenceSource::Ncbi,
1070 "broad" => ReferenceSource::Broad,
1071 "ensembl" => ReferenceSource::Ensembl,
1072 "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
1073 "dragen" | "illumina" => ReferenceSource::Illumina,
1074 _ => ReferenceSource::Custom(s.to_string()),
1075 }
1076}
1077
/// Shortens `s` to at most `max_len` bytes, marking the cut with a trailing `...`.
///
/// Strings that already fit are returned unchanged. Otherwise the result is a prefix of `s`
/// followed by `...`, with the prefix sized so the whole result fits in `max_len` bytes.
///
/// Two cases that used to panic are handled:
/// - the cut position falling inside a multi-byte UTF-8 character: the cut backs off to the
///   previous character boundary, so the result may be slightly shorter than `max_len`;
/// - `max_len < 3`: there is no room for the ellipsis, so only the prefix is returned.
fn truncate(s: &str, max_len: usize) -> String {
    if s.len() <= max_len {
        return s.to_string();
    }

    // Reserve room for the ellipsis when there is any.
    let (budget, suffix) = if max_len >= 3 {
        (max_len - 3, "...")
    } else {
        (max_len, "")
    };

    // Index 0 is always a character boundary, so this loop terminates.
    let mut cut = budget;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }

    format!("{}{}", &s[..cut], suffix)
}
1085
1086#[allow(
1087 clippy::too_many_arguments,
1088 clippy::needless_pass_by_value,
1089 clippy::too_many_lines,
1090 clippy::fn_params_excessive_bools
1091)] fn run_build(
1093 id: String,
1094 name: String,
1095 inputs: Vec<PathBuf>,
1096 assembly: Option<String>,
1097 source: Option<String>,
1098 description: Option<String>,
1099 download_url: Option<String>,
1100 assembly_report_url: Option<String>,
1101 tags: Option<String>,
1102 output: Option<PathBuf>,
1103 append_to: Option<PathBuf>,
1104 force: bool,
1105 input_format: Option<InputFormatArg>,
1106 require_md5: bool,
1107 generate_ucsc_names: bool,
1108 format: OutputFormat,
1109 verbose: bool,
1110) -> anyhow::Result<()> {
1111 let assembly = assembly.map(|s| parse_assembly(&s));
1113
1114 let source = source.map(|s| parse_source(&s));
1116
1117 let tags: Vec<String> = tags
1119 .map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
1120 .unwrap_or_default();
1121
1122 let mut builder = ReferenceBuilder::new(&id, &name).generate_ucsc_names(generate_ucsc_names);
1124
1125 if let Some(assembly) = assembly {
1126 builder = builder.assembly(assembly);
1127 }
1128 if let Some(source) = source {
1129 builder = builder.source(source);
1130 }
1131 if let Some(desc) = description {
1132 builder = builder.description(desc);
1133 }
1134 if let Some(url) = download_url {
1135 builder = builder.download_url(url);
1136 }
1137 if let Some(url) = assembly_report_url {
1138 builder = builder.assembly_report_url(url);
1139 }
1140 if !tags.is_empty() {
1141 builder = builder.tags(tags);
1142 }
1143
1144 for input_path in &inputs {
1146 if !input_path.exists() {
1147 anyhow::bail!("Input file not found: {}", input_path.display());
1148 }
1149
1150 if verbose {
1151 eprintln!("Processing: {}", input_path.display());
1152 }
1153
1154 if let Some(fmt) = input_format {
1155 builder.add_input_with_format(input_path, fmt.into())?;
1156 } else {
1157 builder.add_input(input_path)?;
1158 }
1159 }
1160
1161 let summary = builder.summary();
1163
1164 if !summary.conflicts.is_empty() {
1166 eprintln!("Build failed due to conflicts:");
1167 for conflict in &summary.conflicts {
1168 eprintln!(" - {conflict}");
1169 }
1170 anyhow::bail!(
1171 "Build failed: {} conflict(s) detected",
1172 summary.conflicts.len()
1173 );
1174 }
1175
1176 if require_md5 && summary.with_md5 < summary.total_contigs {
1178 anyhow::bail!(
1179 "Build failed: --require-md5 specified but only {}/{} contigs have MD5",
1180 summary.with_md5,
1181 summary.total_contigs
1182 );
1183 }
1184
1185 let reference = builder.build()?;
1187
1188 if verbose || matches!(format, OutputFormat::Text) {
1190 eprintln!("{summary}");
1191 }
1192
1193 if let Some(catalog_path) = append_to {
1195 let mut catalog = if catalog_path.exists() {
1197 ReferenceCatalog::load_from_file(&catalog_path)?
1198 } else {
1199 ReferenceCatalog::new()
1200 };
1201
1202 let ref_id = crate::core::types::ReferenceId::new(&id);
1204 if catalog.get(&ref_id).is_some() {
1205 if force {
1206 eprintln!("Warning: Overwriting existing reference '{id}'");
1207 let refs: Vec<_> = catalog
1209 .references
1210 .into_iter()
1211 .filter(|r| r.id != ref_id)
1212 .collect();
1213 catalog = ReferenceCatalog::new();
1214 for r in refs {
1215 catalog.add_reference(r);
1216 }
1217 } else {
1218 anyhow::bail!(
1219 "Reference '{id}' already exists in catalog. Use --force to overwrite."
1220 );
1221 }
1222 }
1223
1224 catalog.add_reference(reference);
1225 let json = catalog.to_json()?;
1226 std::fs::write(&catalog_path, json)?;
1227
1228 println!(
1229 "Added reference '{}' to {} ({} total references)",
1230 id,
1231 catalog_path.display(),
1232 catalog.len()
1233 );
1234 } else if let Some(output_path) = output {
1235 let json = serde_json::to_string_pretty(&reference)?;
1237 std::fs::write(&output_path, &json)?;
1238 println!("Wrote reference '{}' to {}", id, output_path.display());
1239 } else {
1240 match format {
1242 OutputFormat::Json => {
1243 println!("{}", serde_json::to_string_pretty(&reference)?);
1244 }
1245 OutputFormat::Text | OutputFormat::Tsv => {
1246 println!("Reference: {}", reference.display_name);
1248 println!("ID: {}", reference.id);
1249 println!("Assembly: {}", reference.assembly);
1250 println!("Source: {}", reference.source);
1251 println!("Contigs: {}", reference.contigs.len());
1252 println!();
1253 println!("Use --output <file> to save as JSON");
1254 }
1255 }
1256 }
1257
1258 Ok(())
1259}
1260
1261fn parse_assembly(s: &str) -> Assembly {
1262 let lower = s.to_lowercase();
1263 match lower.as_str() {
1264 "grch37" | "hg19" | "b37" => Assembly::Grch37,
1265 "grch38" | "hg38" => Assembly::Grch38,
1266 _ => Assembly::Other(s.to_string()),
1267 }
1268}
1269
1270fn parse_source(s: &str) -> ReferenceSource {
1271 let lower = s.to_lowercase();
1272 match lower.as_str() {
1273 "ucsc" => ReferenceSource::Ucsc,
1274 "ncbi" | "grc" => ReferenceSource::Ncbi,
1275 "broad" => ReferenceSource::Broad,
1276 "ensembl" => ReferenceSource::Ensembl,
1277 "illumina" | "dragen" => ReferenceSource::Illumina,
1278 "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
1279 _ => ReferenceSource::Custom(s.to_string()),
1280 }
1281}