use std::path::PathBuf;
use clap::{Args, Subcommand};
use crate::catalog::builder::{InputFormat, ReferenceBuilder};
use crate::catalog::hierarchical::HierarchicalCatalog;
use crate::catalog::store::ReferenceCatalog;
use crate::cli::OutputFormat;
use crate::core::types::{Assembly, ReferenceSource};
/// Converts a count into an `f64` so it can be used in ratio/percentage arithmetic.
///
/// The cast is intentionally lossy for values that do not fit an `f64` mantissa,
/// which is why the precision-loss lint is silenced here.
#[inline]
#[allow(clippy::cast_precision_loss)]
fn count_to_f64(count: usize) -> f64 {
    count as f64
}
// Top-level arguments for the catalog command group; holds the selected subcommand.
// NOTE: plain `//` comments are used on clap-derived items on purpose: `///` doc
// comments would be picked up by clap as help text and change the CLI output.
#[derive(Args)]
pub struct CatalogArgs {
// The catalog subcommand to execute (dispatched by `run`).
#[command(subcommand)]
pub command: CatalogCommands,
}
// Subcommands of the catalog command group. Each variant is handled by the
// matching `run_*` function in this file.
// NOTE: plain `//` comments are used on purpose: `///` doc comments on
// clap-derived items become help text and would change the CLI output.
#[derive(Subcommand)]
#[non_exhaustive]
pub enum CatalogCommands {
// List the references of a catalog, optionally filtered.
List {
// Catalog file to load; the embedded catalog is used when omitted.
#[arg(long)]
catalog: Option<PathBuf>,
// Case-insensitive substring that the reference's assembly must contain.
#[arg(long)]
assembly: Option<String>,
// Case-insensitive substring that the reference's source must contain.
#[arg(long)]
source: Option<String>,
},
// Show the details of a single reference.
Show {
// ID of the reference to look up.
#[arg(required = true)]
id: String,
// Catalog file to load; the embedded catalog is used when omitted.
#[arg(long)]
catalog: Option<PathBuf>,
// In text output, list every contig instead of only the first 25.
#[arg(long)]
all_contigs: bool,
},
// Export a catalog as JSON to a file.
Export {
// Destination file for the exported JSON.
#[arg(required = true)]
output: PathBuf,
// Catalog file to load; the embedded catalog is used when omitted.
#[arg(long)]
catalog: Option<PathBuf>,
},
// List the contents of a hierarchical catalog file.
ListHierarchical {
// Hierarchical catalog file to load.
#[arg(required = true)]
catalog: PathBuf,
},
// Build a FASTA distribution and write, append, or print it.
BuildHierarchical {
// ID of the distribution to build.
#[arg(long, required = true)]
id: String,
// Display name of the distribution.
#[arg(long, required = true)]
name: String,
// One or more input files (`-i` / `--input`).
#[arg(short, long = "input", required = true, num_args = 1..)]
inputs: Vec<PathBuf>,
// Assembly to attach the distribution to when appending; takes precedence
// over an inferred assembly ID.
#[arg(long)]
assembly_id: Option<String>,
// Version to attach the distribution to when appending; takes precedence
// over an inferred version ID.
#[arg(long)]
version_id: Option<String>,
// Source name; parsed by `parse_reference_source`, a custom "custom" source
// is used when omitted.
#[arg(long)]
source: Option<String>,
// Download URL recorded on the distribution.
#[arg(long)]
download_url: Option<String>,
// Comma-separated list of tags.
#[arg(long)]
tags: Option<String>,
// File to write the result to (used only when `--append-to` is absent).
#[arg(short, long)]
output: Option<PathBuf>,
// Existing hierarchical catalog file to add the distribution to.
#[arg(long)]
append_to: Option<PathBuf>,
// Allow replacing an existing distribution or overwriting an existing output file.
#[arg(long)]
force: bool,
// Fail when any contig of the built distribution lacks an MD5.
#[arg(long)]
require_md5: bool,
// Infer the base assembly/version from a catalog. The path is optional:
// without one, the `--append-to` catalog is used for inference.
#[arg(long)]
#[allow(clippy::option_option)]
infer_assembly: Option<Option<PathBuf>>,
// Disable UCSC name generation (`run` passes the negation to the builder).
#[arg(long)]
no_generate_ucsc_names: bool,
},
// Build a reference and write, append, or print it.
Build {
// ID of the reference to build.
#[arg(long, required = true)]
id: String,
// Display name of the reference.
#[arg(long, required = true)]
name: String,
// One or more input files (`-i` / `--input`).
#[arg(short, long = "input", required = true, num_args = 1..)]
inputs: Vec<PathBuf>,
// Assembly name; parsed by `parse_assembly`.
#[arg(long)]
assembly: Option<String>,
// Source name; parsed by `parse_source`.
#[arg(long)]
source: Option<String>,
// Free-text description recorded on the reference.
#[arg(long)]
description: Option<String>,
// Download URL recorded on the reference.
#[arg(long)]
download_url: Option<String>,
// Assembly report URL recorded on the reference.
#[arg(long)]
assembly_report_url: Option<String>,
// Comma-separated list of tags.
#[arg(long)]
tags: Option<String>,
// Species recorded on the reference.
#[arg(long)]
species: Option<String>,
// File to write the reference JSON to (used only when `--append-to` is absent).
#[arg(short, long)]
output: Option<PathBuf>,
// Catalog file to add the reference to; a new catalog is started if it does not exist.
#[arg(long)]
append_to: Option<PathBuf>,
// Allow replacing a reference with the same ID when appending.
#[arg(long)]
force: bool,
// Explicit format applied to every input; otherwise the builder's `add_input` is used.
#[arg(long, value_enum)]
input_format: Option<InputFormatArg>,
// Fail when not every contig has an MD5.
#[arg(long)]
require_md5: bool,
// Disable UCSC name generation (`run` passes the negation to the builder).
#[arg(long)]
no_generate_ucsc_names: bool,
},
}
// Command-line selectable input formats for `--input-format`.
// Each variant converts to the `InputFormat` variant of the same name via the
// `From` impl in this file.
// NOTE: plain `//` comments are used on purpose: `///` doc comments on a
// `ValueEnum` become help text and would change the CLI output.
#[derive(Clone, Copy, Debug, clap::ValueEnum)]
pub enum InputFormatArg {
Dict,
Fai,
Fasta,
NcbiReport,
Sam,
Bam,
Cram,
Vcf,
Tsv,
}
/// Converts a command-line format choice into the builder's [`InputFormat`].
impl From<InputFormatArg> for InputFormat {
    /// Maps every CLI variant onto the `InputFormat` variant of the same name.
    fn from(arg: InputFormatArg) -> Self {
        match arg {
            InputFormatArg::Dict => Self::Dict,
            InputFormatArg::Fai => Self::Fai,
            InputFormatArg::Fasta => Self::Fasta,
            InputFormatArg::NcbiReport => Self::NcbiReport,
            InputFormatArg::Sam => Self::Sam,
            InputFormatArg::Bam => Self::Bam,
            InputFormatArg::Cram => Self::Cram,
            InputFormatArg::Vcf => Self::Vcf,
            InputFormatArg::Tsv => Self::Tsv,
        }
    }
}
/// Entry point for the catalog command group: dispatches the parsed subcommand
/// to its `run_*` handler.
///
/// * `args` - parsed catalog arguments holding the selected subcommand.
/// * `format` - output format forwarded to the handlers that produce output.
/// * `verbose` - forwarded to the handlers that emit diagnostics.
///
/// # Errors
/// Propagates whatever error the selected handler returns.
pub fn run(args: CatalogArgs, format: OutputFormat, verbose: bool) -> anyhow::Result<()> {
match args.command {
CatalogCommands::List {
catalog,
assembly,
source,
} => run_list(
catalog,
assembly.as_deref(),
source.as_deref(),
format,
verbose,
),
CatalogCommands::Show {
id,
catalog,
all_contigs,
} => run_show(id, catalog, all_contigs, format),
CatalogCommands::Export { output, catalog } => run_export(output, catalog),
CatalogCommands::ListHierarchical { catalog } => {
run_list_hierarchical(catalog, format, verbose)
}
// The handler takes these positionally, so the order below must match its
// parameter list exactly.
CatalogCommands::BuildHierarchical {
id,
name,
inputs,
assembly_id,
version_id,
source,
download_url,
tags,
output,
append_to,
force,
require_md5,
infer_assembly,
no_generate_ucsc_names,
} => run_build_hierarchical(
id,
name,
inputs,
assembly_id,
version_id,
source,
download_url,
tags,
output,
append_to,
force,
require_md5,
infer_assembly,
// The CLI flag is negative; the handler expects the positive form.
!no_generate_ucsc_names, format,
verbose,
),
// Same positional contract as above, this time with `run_build`.
CatalogCommands::Build {
id,
name,
inputs,
assembly,
source,
description,
download_url,
assembly_report_url,
tags,
species,
output,
append_to,
force,
input_format,
require_md5,
no_generate_ucsc_names,
} => run_build(
id,
name,
inputs,
assembly,
source,
description,
download_url,
assembly_report_url,
tags,
species,
output,
append_to,
force,
input_format,
require_md5,
// The CLI flag is negative; the handler expects the positive form.
!no_generate_ucsc_names, format,
verbose,
),
}
}
/// Lists catalog references, optionally filtered, as text, JSON, or TSV on stdout.
///
/// * `catalog_path` - catalog file to load; the embedded catalog is used when `None`.
/// * `assembly_filter` / `source_filter` - case-insensitive substrings that the
///   displayed assembly / source of a reference must contain to be listed.
/// * `format` - output format.
/// * `verbose` - reports the catalog size on stderr and, in text mode, adds an
///   MD5 coverage line (plus download URL when present) under each reference.
///
/// # Errors
/// Returns an error if the catalog cannot be loaded or JSON serialization fails.
#[allow(clippy::too_many_lines)]
fn run_list(
    catalog_path: Option<PathBuf>,
    assembly_filter: Option<&str>,
    source_filter: Option<&str>,
    format: OutputFormat,
    verbose: bool,
) -> anyhow::Result<()> {
    let catalog = match catalog_path {
        Some(path) => ReferenceCatalog::load_from_file(&path)?,
        None => ReferenceCatalog::load_embedded()?,
    };
    if verbose {
        eprintln!("Loaded catalog with {} references", catalog.len());
    }

    // Lowercase the filter needles once instead of once per reference.
    let assembly_needle = assembly_filter.map(str::to_lowercase);
    let source_needle = source_filter.map(str::to_lowercase);

    let filtered: Vec<_> = catalog
        .references
        .iter()
        .filter(|r| {
            if let Some(needle) = assembly_needle.as_deref() {
                if !r.assembly.to_string().to_lowercase().contains(needle) {
                    return false;
                }
            }
            if let Some(needle) = source_needle.as_deref() {
                if !r.source.to_string().to_lowercase().contains(needle) {
                    return false;
                }
            }
            true
        })
        .collect();

    match format {
        OutputFormat::Text => {
            // Widest the name column may get before names are shortened by `truncate`.
            const MAX_NAME_WIDTH: usize = 35;

            // Render assembly/source once per row; the strings feed both the
            // column-width calculation and the rows themselves. They must be
            // `String`s so that the width specifier pads them.
            let labels: Vec<(String, String)> = filtered
                .iter()
                .map(|r| (r.assembly.to_string(), r.source.to_string()))
                .collect();

            // Each column is at least as wide as its header.
            let id_width = filtered.iter().map(|r| r.id.0.len()).fold(2, usize::max);
            let name_width = filtered
                .iter()
                .map(|r| r.display_name.len().min(MAX_NAME_WIDTH))
                .fold(4, usize::max);
            let assembly_width = labels
                .iter()
                .map(|(assembly, _)| assembly.len())
                .fold(8, usize::max);
            let source_width = labels
                .iter()
                .map(|(_, source)| source.len())
                .fold(6, usize::max);
            let total_width = id_width + name_width + assembly_width + source_width + 8 + 8;

            println!("Reference Catalog ({} references)\n", filtered.len());
            println!(
                "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
                "ID",
                "Name",
                "Assembly",
                "Source",
                "Contigs",
                id_w = id_width,
                name_w = name_width,
                asm_w = assembly_width,
                src_w = source_width
            );
            println!("{}", "-".repeat(total_width));

            for (r, (assembly, source)) in filtered.iter().zip(&labels) {
                println!(
                    "{:<id_w$} {:<name_w$} {:<asm_w$} {:<src_w$} {:>8}",
                    r.id.0,
                    truncate(&r.display_name, name_width),
                    assembly,
                    source,
                    r.contigs.len(),
                    id_w = id_width,
                    name_w = name_width,
                    asm_w = assembly_width,
                    src_w = source_width
                );
                if verbose {
                    let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
                    // Avoid dividing by zero for references without contigs.
                    let md5_pct = if r.contigs.is_empty() {
                        0.0
                    } else {
                        100.0 * count_to_f64(md5_count) / count_to_f64(r.contigs.len())
                    };
                    if let Some(url) = &r.download_url {
                        println!(
                            " └─ MD5: {}/{} ({:.0}%) URL: {}",
                            md5_count,
                            r.contigs.len(),
                            md5_pct,
                            url
                        );
                    } else {
                        println!(
                            " └─ MD5: {}/{} ({:.0}%)",
                            md5_count,
                            r.contigs.len(),
                            md5_pct
                        );
                    }
                }
            }
        }
        OutputFormat::Json => {
            let output: Vec<serde_json::Value> = filtered
                .iter()
                .map(|r| {
                    let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
                    let role_counts = r.role_counts();
                    let mut json = serde_json::json!({
                        "id": r.id.0,
                        "display_name": r.display_name,
                        "assembly": r.assembly.to_string(),
                        "source": r.source.to_string(),
                        "contig_count": r.contigs.len(),
                        "md5_count": md5_count,
                        "has_decoy": r.has_decoy(),
                        "has_alt": r.has_alt(),
                        "fasta_url": r.download_url,
                        "assembly_report_url": r.assembly_report_url,
                        "role_counts": {
                            "assembled_molecule": role_counts.assembled_molecule,
                            "alt_scaffold": role_counts.alt_scaffold,
                            "fix_patch": role_counts.fix_patch,
                            "novel_patch": role_counts.novel_patch,
                            "unlocalized_scaffold": role_counts.unlocalized_scaffold,
                            "unplaced_scaffold": role_counts.unplaced_scaffold,
                            "unknown": role_counts.unknown,
                        },
                        "tags": r.tags,
                    });
                    // The key is emitted only when there is something to report.
                    if !r.contigs_missing_from_fasta.is_empty() {
                        json["contigs_missing_from_fasta"] =
                            serde_json::json!(&r.contigs_missing_from_fasta);
                    }
                    json
                })
                .collect();
            println!("{}", serde_json::to_string_pretty(&output)?);
        }
        OutputFormat::Tsv => {
            println!("id\tdisplay_name\tassembly\tsource\tcontig_count\tmd5_count\thas_decoy\thas_alt\tdownload_url");
            for r in &filtered {
                let md5_count = r.contigs.iter().filter(|c| c.md5.is_some()).count();
                println!(
                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
                    r.id.0,
                    r.display_name,
                    r.assembly,
                    r.source,
                    r.contigs.len(),
                    md5_count,
                    r.has_decoy(),
                    r.has_alt(),
                    r.download_url.as_deref().unwrap_or("")
                );
            }
        }
    }
    Ok(())
}
/// Prints one catalog reference, looked up by ID, in the requested format.
///
/// * `id` - ID of the reference to show.
/// * `catalog_path` - catalog file to load; the embedded catalog is used when `None`.
/// * `all_contigs` - in text mode, list every contig instead of a 25-contig preview.
/// * `format` - text summary, the serialized reference as JSON, or a contig TSV.
///
/// # Errors
/// Returns an error if the catalog cannot be loaded, the ID is not in the
/// catalog, or JSON serialization fails.
#[allow(clippy::needless_pass_by_value)]
fn run_show(
    id: String,
    catalog_path: Option<PathBuf>,
    all_contigs: bool,
    format: OutputFormat,
) -> anyhow::Result<()> {
    // Number of contigs listed in text mode unless all contigs were requested.
    const PREVIEW_LIMIT: usize = 25;

    let catalog = match catalog_path {
        Some(path) => ReferenceCatalog::load_from_file(&path)?,
        None => ReferenceCatalog::load_embedded()?,
    };
    let ref_id = crate::core::types::ReferenceId::new(&id);
    let reference = match catalog.get(&ref_id) {
        Some(found) => found,
        None => return Err(anyhow::anyhow!("Reference '{id}' not found")),
    };

    match format {
        OutputFormat::Json => {
            println!("{}", serde_json::to_string_pretty(&reference)?);
        }
        OutputFormat::Tsv => {
            println!("name\tlength\tmd5");
            for contig in &reference.contigs {
                let md5 = contig.md5.as_deref().unwrap_or("");
                println!("{}\t{}\t{}", contig.name, contig.length, md5);
            }
        }
        OutputFormat::Text => {
            println!("Reference: {}\n", reference.display_name);
            println!("ID: {}", reference.id);
            println!("Assembly: {}", reference.assembly);
            println!("Source: {}", reference.source);
            println!("Naming: {:?}", reference.naming_convention);
            println!("Contigs: {}", reference.contigs.len());
            println!("Has Decoy: {}", reference.has_decoy());
            println!("Has ALT: {}", reference.has_alt());
            if let Some(desc) = &reference.description {
                println!("\nDescription: {desc}");
            }
            if let Some(url) = &reference.download_url {
                println!("\nDownload URL: {url}");
            }
            if !reference.tags.is_empty() {
                println!("\nTags: {}", reference.tags.join(", "));
            }

            // How many contigs actually get listed: all of them, or the preview.
            let total = reference.contigs.len();
            let shown = if all_contigs {
                total
            } else {
                total.min(PREVIEW_LIMIT)
            };

            println!("\nContigs:");
            println!("{:<25} {:>15} MD5", "Name", "Length");
            println!("{}", "-".repeat(80));
            for contig in reference.contigs.iter().take(shown) {
                println!(
                    "{:<25} {:>15} {}",
                    contig.name,
                    contig.length,
                    contig.md5.as_deref().unwrap_or("-")
                );
            }
            // Only a shortened preview leaves contigs unlisted.
            if shown < total {
                println!(
                    "\n... and {} more contigs (use --all-contigs to show all)",
                    total - shown
                );
            }
        }
    }
    Ok(())
}
/// Writes a catalog as JSON to `output` and reports the number of exported
/// references on stdout.
///
/// The catalog is read from `catalog_path` when given, otherwise the embedded
/// catalog is exported.
///
/// # Errors
/// Returns an error if the catalog cannot be loaded or serialized, or if the
/// output file cannot be written.
#[allow(clippy::needless_pass_by_value)]
fn run_export(output: PathBuf, catalog_path: Option<PathBuf>) -> anyhow::Result<()> {
    let catalog = match catalog_path {
        Some(path) => ReferenceCatalog::load_from_file(&path)?,
        None => ReferenceCatalog::load_embedded()?,
    };

    let serialized = catalog.to_json()?;
    std::fs::write(&output, serialized)?;

    let exported = catalog.len();
    println!("Exported {} references to {}", exported, output.display());
    Ok(())
}
/// Prints the contents of a hierarchical catalog file as text, JSON, or TSV.
///
/// * `catalog_path` - hierarchical catalog file to load.
/// * `format` - text tree with a summary line, the serialized catalog as JSON,
///   or one TSV row per distribution (standalone distributions have empty
///   assembly and version columns).
/// * `verbose` - reports the load on stderr and, in text mode, adds presence
///   counts and download URLs to versioned distributions.
///
/// # Errors
/// Returns an error if the catalog cannot be loaded or JSON serialization fails.
#[allow(clippy::needless_pass_by_value, clippy::too_many_lines)]
fn run_list_hierarchical(
    catalog_path: PathBuf,
    format: OutputFormat,
    verbose: bool,
) -> anyhow::Result<()> {
    use crate::core::assembly::ReportSource;

    let catalog = HierarchicalCatalog::load(&catalog_path)?;
    if verbose {
        eprintln!(
            "Loaded hierarchical catalog v{} with {} assemblies",
            catalog.version,
            catalog.assemblies.len()
        );
    }

    match format {
        OutputFormat::Json => {
            println!("{}", serde_json::to_string_pretty(&catalog)?);
        }
        OutputFormat::Tsv => {
            println!(
                "assembly_id\tversion_id\tdistribution_id\tdisplay_name\tcontig_count\tmd5_count"
            );
            for asm in &catalog.assemblies {
                for ver in &asm.versions {
                    for d in &ver.fasta_distributions {
                        let with_md5 = d.contigs.iter().filter(|c| !c.md5.is_empty()).count();
                        println!(
                            "{}\t{}\t{}\t{}\t{}\t{}",
                            asm.id,
                            ver.id,
                            d.id,
                            d.display_name,
                            d.contigs.len(),
                            with_md5
                        );
                    }
                }
            }
            // Standalone distributions have no assembly/version: leave those columns empty.
            for d in &catalog.standalone_distributions {
                let with_md5 = d.contigs.iter().filter(|c| !c.md5.is_empty()).count();
                println!(
                    "\t\t{}\t{}\t{}\t{}",
                    d.id,
                    d.display_name,
                    d.contigs.len(),
                    with_md5
                );
            }
        }
        OutputFormat::Text => {
            println!("Hierarchical Reference Catalog (v{})\n", catalog.version);

            // Totals for the summary line (standalone distributions are not counted).
            let version_total: usize = catalog.assemblies.iter().map(|a| a.versions.len()).sum();
            let distribution_total: usize = catalog
                .assemblies
                .iter()
                .flat_map(|a| a.versions.iter())
                .map(|v| v.fasta_distributions.len())
                .sum();
            let contig_total: usize = catalog
                .assemblies
                .iter()
                .flat_map(|a| a.versions.iter())
                .flat_map(|v| v.fasta_distributions.iter())
                .map(|d| d.contigs.len())
                .sum();
            println!(
                "Summary: {} assemblies, {} versions, {} distributions, {} total contigs\n",
                catalog.assemblies.len(),
                version_total,
                distribution_total,
                contig_total
            );

            for asm in &catalog.assemblies {
                println!("{} ({})", asm.name, asm.id);
                println!(" Organism: {}", asm.organism);
                for ver in &asm.versions {
                    println!("\n Version: {} ({})", ver.version, ver.id);
                    match &ver.source {
                        ReportSource::Ncbi { accession, .. } => {
                            println!(" Source: NCBI ({accession})");
                        }
                        ReportSource::DerivedFromFasta {
                            base_assembly: Some(base),
                            ..
                        } => {
                            println!(" Source: Derived from FASTA (base: {base})");
                        }
                        ReportSource::DerivedFromFasta {
                            base_assembly: None,
                            ..
                        } => {
                            println!(" Source: Derived from FASTA");
                        }
                        ReportSource::Manual { .. } => {
                            println!(" Source: Manual");
                        }
                    }
                    let report_contig_total = ver.report_contigs.len();
                    if report_contig_total != 0 {
                        println!(" Report contigs: {}", report_contig_total);
                    }
                    println!(" Distributions:");
                    for d in &ver.fasta_distributions {
                        let with_md5 = d.contigs.iter().filter(|c| !c.md5.is_empty()).count();
                        println!(
                            " - {} ({}): {} contigs, {} with MD5",
                            d.display_name,
                            d.id,
                            d.contigs.len(),
                            with_md5
                        );
                        if !verbose {
                            continue;
                        }
                        let presence = d.presence_counts();
                        if presence.in_both > 0 || presence.fasta_only > 0 {
                            println!(
                                " Presence: {} in-both, {} fasta-only",
                                presence.in_both, presence.fasta_only
                            );
                        }
                        if let Some(url) = &d.download_url {
                            println!(" URL: {url}");
                        }
                    }
                }
                println!();
            }

            if !catalog.standalone_distributions.is_empty() {
                println!("Standalone Distributions:");
                for d in &catalog.standalone_distributions {
                    let with_md5 = d.contigs.iter().filter(|c| !c.md5.is_empty()).count();
                    println!(
                        " - {} ({}): {} contigs, {} with MD5",
                        d.display_name,
                        d.id,
                        d.contigs.len(),
                        with_md5
                    );
                }
            }
        }
    }
    Ok(())
}
#[allow(
clippy::too_many_arguments,
clippy::needless_pass_by_value,
clippy::option_option,
clippy::too_many_lines,
clippy::fn_params_excessive_bools
)] fn run_build_hierarchical(
id: String,
name: String,
inputs: Vec<PathBuf>,
assembly_id: Option<String>,
version_id: Option<String>,
source: Option<String>,
download_url: Option<String>,
tags: Option<String>,
output: Option<PathBuf>,
append_to: Option<PathBuf>,
force: bool,
require_md5: bool,
infer_assembly: Option<Option<PathBuf>>,
generate_ucsc_names: bool,
format: OutputFormat,
verbose: bool,
) -> anyhow::Result<()> {
use crate::catalog::builder::DistributionBuilder;
let ref_source = source.map_or(ReferenceSource::Custom("custom".to_string()), |s| {
parse_reference_source(&s)
});
let tags: Vec<String> = tags
.map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
.unwrap_or_default();
let mut builder = DistributionBuilder::new(&id)
.with_display_name(&name)
.with_source(ref_source)
.with_generate_ucsc_names(generate_ucsc_names);
if let Some(url) = download_url {
builder = builder.with_download_url(url);
}
if !tags.is_empty() {
builder = builder.with_tags(tags);
}
for input_path in &inputs {
if !input_path.exists() {
anyhow::bail!("Input file not found: {}", input_path.display());
}
if verbose {
eprintln!("Processing: {}", input_path.display());
}
builder.add_input(input_path)?;
}
let dist = builder.build()?;
if require_md5 {
let missing_md5: Vec<_> = dist
.contigs
.iter()
.filter(|c| c.md5.is_empty())
.map(|c| c.name.as_str())
.collect();
if !missing_md5.is_empty() {
anyhow::bail!(
"MD5 required but {} contig(s) lack MD5: {}",
missing_md5.len(),
missing_md5.join(", ")
);
}
}
let md5_count = dist.contigs.iter().filter(|c| !c.md5.is_empty()).count();
if verbose {
eprintln!(
"Built distribution '{}' with {} contigs ({} with MD5)",
id,
dist.contigs.len(),
md5_count
);
}
let (inferred_assembly_id, inferred_version_id) = if infer_assembly.is_some() {
let infer_catalog = match &infer_assembly {
Some(Some(path)) => Some(HierarchicalCatalog::load(path)?),
Some(None) => {
if let Some(ref append_path) = append_to {
Some(HierarchicalCatalog::load(append_path)?)
} else {
if verbose {
eprintln!("Warning: No catalog specified for inference. Use --infer-assembly=<path> or --append-to");
}
None
}
}
None => None,
};
if let Some(ref catalog) = infer_catalog {
if let Some(inferred) = catalog.infer_base_assembly_default(&dist.contigs) {
if verbose {
eprintln!(
"Inferred base assembly: {} {} ({:.1}% match, {}/{} contigs)",
inferred.assembly_name,
inferred.version_string,
inferred.match_rate * 100.0,
inferred.matched_contigs,
inferred.total_input_contigs
);
}
(
assembly_id.clone().or(Some(inferred.assembly_id)),
version_id.clone().or(Some(inferred.version_id)),
)
} else {
if verbose {
eprintln!("Could not infer base assembly (no match above 90% threshold)");
}
(assembly_id.clone(), version_id.clone())
}
} else {
(assembly_id.clone(), version_id.clone())
}
} else {
(assembly_id.clone(), version_id.clone())
};
if let Some(append_path) = append_to {
let mut catalog = HierarchicalCatalog::load(&append_path)?;
if let (Some(asm_id), Some(ver_id)) = (&inferred_assembly_id, &inferred_version_id) {
let mut found = false;
for assembly in &mut catalog.assemblies {
if assembly.id == *asm_id {
for version in &mut assembly.versions {
if version.id == *ver_id {
if !force && version.fasta_distributions.iter().any(|d| d.id == id) {
anyhow::bail!(
"Distribution '{id}' already exists in version '{ver_id}'. Use --force to overwrite."
);
}
version.fasta_distributions.retain(|d| d.id != id);
version.fasta_distributions.push(dist.clone());
found = true;
break;
}
}
}
}
if !found {
anyhow::bail!("Assembly '{asm_id}' with version '{ver_id}' not found in catalog");
}
} else {
if !force && catalog.standalone_distributions.iter().any(|d| d.id == id) {
anyhow::bail!(
"Standalone distribution '{id}' already exists. Use --force to overwrite."
);
}
catalog.standalone_distributions.retain(|d| d.id != id);
catalog.standalone_distributions.push(dist.clone());
}
catalog.save(&append_path)?;
eprintln!("Added distribution '{}' to {}", id, append_path.display());
} else if let Some(out_path) = output {
if out_path.exists() && !force {
anyhow::bail!(
"Output file '{}' exists. Use --force to overwrite.",
out_path.display()
);
}
if let OutputFormat::Json = format {
let json = serde_json::to_string_pretty(&dist)?;
std::fs::write(&out_path, json)?;
eprintln!("Wrote distribution to {}", out_path.display());
} else {
let catalog = HierarchicalCatalog::new().with_standalone_distribution(dist);
catalog.save(&out_path)?;
eprintln!("Wrote hierarchical catalog to {}", out_path.display());
}
} else {
match format {
OutputFormat::Json => {
println!("{}", serde_json::to_string_pretty(&dist)?);
}
OutputFormat::Text => {
print_distribution_summary(&dist);
}
OutputFormat::Tsv => {
println!("name\tlength\tmd5\treport_contig_id");
for c in &dist.contigs {
println!(
"{}\t{}\t{}\t{}",
c.name,
c.length,
c.md5,
c.report_contig_id
.map(|i| i.to_string())
.unwrap_or_default()
);
}
}
}
}
Ok(())
}
/// Prints a human-readable summary of a distribution to stdout: identity,
/// source, optional URL and tags, contig totals, and presence counts when any
/// are non-zero.
fn print_distribution_summary(dist: &crate::core::assembly::FastaDistribution) {
    let contigs = &dist.contigs;

    println!("Distribution: {} ({})", dist.display_name, dist.id);
    println!("Source: {:?}", dist.source);
    if let Some(url) = dist.download_url.as_ref() {
        println!("Download URL: {url}");
    }
    if !dist.tags.is_empty() {
        let joined_tags = dist.tags.join(", ");
        println!("Tags: {}", joined_tags);
    }

    println!("Contigs: {}", contigs.len());
    let with_md5 = contigs.iter().filter(|c| !c.md5.is_empty()).count();
    println!("With MD5: {with_md5}");
    let linked_to_report = contigs
        .iter()
        .filter(|c| c.report_contig_id.is_some())
        .count();
    println!("Linked to report: {linked_to_report}");

    let presence = dist.presence_counts();
    let has_presence = presence.in_both > 0 || presence.fasta_only > 0;
    if has_presence {
        println!(
            "Presence: {} in-both, {} fasta-only",
            presence.in_both, presence.fasta_only
        );
    }
}
/// Parses a user-supplied source name into a [`ReferenceSource`].
///
/// Matching is case-insensitive and accepts the same aliases as `parse_source`
/// (including `grc` for NCBI), so `--source` means the same thing for every
/// build command. Unrecognized names are kept verbatim, with their original
/// casing, as a custom source.
fn parse_reference_source(s: &str) -> ReferenceSource {
    match s.to_lowercase().as_str() {
        "ucsc" => ReferenceSource::Ucsc,
        "ncbi" | "grc" => ReferenceSource::Ncbi,
        "broad" => ReferenceSource::Broad,
        "ensembl" => ReferenceSource::Ensembl,
        "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
        "dragen" | "illumina" => ReferenceSource::Illumina,
        _ => ReferenceSource::Custom(s.to_string()),
    }
}
/// Shortens `s` to at most `max_len` bytes, marking the cut with a trailing `...`.
///
/// Strings that already fit are returned unchanged. The cut never lands inside
/// a multi-byte character: it backs up to the previous character boundary, so
/// the result may be slightly shorter than `max_len`. When `max_len` is too
/// small to hold the ellipsis, the text is cut to `max_len` without one.
fn truncate(s: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    if s.len() <= max_len {
        return s.to_string();
    }

    // Reserve room for the ellipsis when it fits; otherwise keep plain text only.
    let (budget, suffix) = if max_len >= ELLIPSIS.len() {
        (max_len - ELLIPSIS.len(), ELLIPSIS)
    } else {
        (max_len, "")
    };

    // Slicing at a non-boundary byte would panic; index 0 is always a boundary,
    // so this loop terminates.
    let mut end = budget;
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    format!("{}{}", &s[..end], suffix)
}
#[allow(
clippy::too_many_arguments,
clippy::needless_pass_by_value,
clippy::too_many_lines,
clippy::fn_params_excessive_bools
)] fn run_build(
id: String,
name: String,
inputs: Vec<PathBuf>,
assembly: Option<String>,
source: Option<String>,
description: Option<String>,
download_url: Option<String>,
assembly_report_url: Option<String>,
tags: Option<String>,
species: Option<String>,
output: Option<PathBuf>,
append_to: Option<PathBuf>,
force: bool,
input_format: Option<InputFormatArg>,
require_md5: bool,
generate_ucsc_names: bool,
format: OutputFormat,
verbose: bool,
) -> anyhow::Result<()> {
let assembly = assembly.map(|s| parse_assembly(&s));
let source = source.map(|s| parse_source(&s));
let tags: Vec<String> = tags
.map(|s| s.split(',').map(|t| t.trim().to_string()).collect())
.unwrap_or_default();
let mut builder = ReferenceBuilder::new(&id, &name).generate_ucsc_names(generate_ucsc_names);
if let Some(assembly) = assembly {
builder = builder.assembly(assembly);
}
if let Some(source) = source {
builder = builder.source(source);
}
if let Some(desc) = description {
builder = builder.description(desc);
}
if let Some(url) = download_url {
builder = builder.download_url(url);
}
if let Some(url) = assembly_report_url {
builder = builder.assembly_report_url(url);
}
if !tags.is_empty() {
builder = builder.tags(tags);
}
if let Some(sp) = species {
builder = builder.species(sp);
}
for input_path in &inputs {
if !input_path.exists() {
anyhow::bail!("Input file not found: {}", input_path.display());
}
if verbose {
eprintln!("Processing: {}", input_path.display());
}
if let Some(fmt) = input_format {
builder.add_input_with_format(input_path, fmt.into())?;
} else {
builder.add_input(input_path)?;
}
}
let summary = builder.summary();
if !summary.conflicts.is_empty() {
eprintln!("Build failed due to conflicts:");
for conflict in &summary.conflicts {
eprintln!(" - {conflict}");
}
anyhow::bail!(
"Build failed: {} conflict(s) detected",
summary.conflicts.len()
);
}
if require_md5 && summary.with_md5 < summary.total_contigs {
anyhow::bail!(
"Build failed: --require-md5 specified but only {}/{} contigs have MD5",
summary.with_md5,
summary.total_contigs
);
}
let reference = builder.build()?;
if verbose || matches!(format, OutputFormat::Text) {
eprintln!("{summary}");
}
if let Some(catalog_path) = append_to {
let mut catalog = if catalog_path.exists() {
ReferenceCatalog::load_from_file(&catalog_path)?
} else {
ReferenceCatalog::new()
};
let ref_id = crate::core::types::ReferenceId::new(&id);
if catalog.get(&ref_id).is_some() {
if force {
eprintln!("Warning: Overwriting existing reference '{id}'");
let refs: Vec<_> = catalog
.references
.into_iter()
.filter(|r| r.id != ref_id)
.collect();
catalog = ReferenceCatalog::new();
for r in refs {
catalog.add_reference(r);
}
} else {
anyhow::bail!(
"Reference '{id}' already exists in catalog. Use --force to overwrite."
);
}
}
catalog.add_reference(reference);
let json = catalog.to_json()?;
std::fs::write(&catalog_path, json)?;
println!(
"Added reference '{}' to {} ({} total references)",
id,
catalog_path.display(),
catalog.len()
);
} else if let Some(output_path) = output {
let json = serde_json::to_string_pretty(&reference)?;
std::fs::write(&output_path, &json)?;
println!("Wrote reference '{}' to {}", id, output_path.display());
} else {
match format {
OutputFormat::Json => {
println!("{}", serde_json::to_string_pretty(&reference)?);
}
OutputFormat::Text | OutputFormat::Tsv => {
println!("Reference: {}", reference.display_name);
println!("ID: {}", reference.id);
println!("Assembly: {}", reference.assembly);
println!("Source: {}", reference.source);
println!("Contigs: {}", reference.contigs.len());
println!();
println!("Use --output <file> to save as JSON");
}
}
}
Ok(())
}
/// Parses a user-supplied assembly name, case-insensitively.
///
/// `grch37`, `hg19` and `b37` map to GRCh37; `grch38` and `hg38` map to GRCh38.
/// Any other name is kept verbatim, with its original casing, as `Assembly::Other`.
fn parse_assembly(s: &str) -> Assembly {
    match s.to_lowercase().as_str() {
        "grch38" | "hg38" => Assembly::Grch38,
        "grch37" | "hg19" | "b37" => Assembly::Grch37,
        _ => Assembly::Other(s.to_string()),
    }
}
/// Parses a user-supplied source name, case-insensitively.
///
/// Known names and aliases map to their dedicated [`ReferenceSource`] variant;
/// any other name is kept verbatim, with its original casing, as a custom source.
fn parse_source(s: &str) -> ReferenceSource {
    match s.to_lowercase().as_str() {
        "1kg" | "1000genomes" => ReferenceSource::OneThousandGenomes,
        "broad" => ReferenceSource::Broad,
        "ensembl" => ReferenceSource::Ensembl,
        "illumina" | "dragen" => ReferenceSource::Illumina,
        "ncbi" | "grc" => ReferenceSource::Ncbi,
        "ucsc" => ReferenceSource::Ucsc,
        _ => ReferenceSource::Custom(s.to_string()),
    }
}