1use std::path::{Path, PathBuf};
2
3use clap::Args;
4
5use crate::catalog::hierarchical::HierarchicalCatalog;
6use crate::catalog::store::ReferenceCatalog;
7use crate::cli::OutputFormat;
8use crate::core::header::QueryHeader;
9use crate::core::types::Confidence;
10use crate::matching::engine::{MatchResult, MatchingConfig, MatchingEngine, ScoringWeights};
11use crate::matching::hierarchical_engine::{HierarchicalMatchResult, HierarchicalMatchingEngine};
12use crate::matching::Suggestion;
13use crate::parsing;
14
15#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)]
18pub enum MissingContigHandling {
19 #[default]
21 Warn,
22 Strict,
24 Silent,
26}
27
28#[derive(Args)]
29pub struct IdentifyArgs {
30 #[arg(required = true)]
33 pub input: PathBuf,
34
35 #[arg(long)]
37 pub input_format: Option<InputFormat>,
38
39 #[arg(short = 'n', long, default_value = "5")]
41 pub max_matches: usize,
42
43 #[arg(long)]
45 pub exact_only: bool,
46
47 #[arg(long)]
49 pub catalog: Option<PathBuf>,
50
51 #[arg(long)]
53 pub hierarchical: bool,
54
55 #[arg(long, value_enum, default_value = "warn")]
58 pub missing_contig_handling: MissingContigHandling,
59
60 #[arg(long, default_value = "70", value_parser = clap::value_parser!(u32).range(0..=100))]
64 pub weight_match: u32,
65
66 #[arg(long, default_value = "20", value_parser = clap::value_parser!(u32).range(0..=100))]
69 pub weight_coverage: u32,
70
71 #[arg(long, default_value = "10", value_parser = clap::value_parser!(u32).range(0..=100))]
74 pub weight_order: u32,
75}
76
77#[derive(Clone, Copy, Debug, clap::ValueEnum)]
78pub enum InputFormat {
79 Sam,
80 Bam,
81 Cram,
82 Dict,
83 Fai,
84 Fasta,
85 Vcf,
86 Tsv,
87 Csv,
88}
89
90#[allow(clippy::needless_pass_by_value)] pub fn run(args: IdentifyArgs, format: OutputFormat, verbose: bool) -> anyhow::Result<()> {
97 let query = parse_input(&args)?;
99
100 if verbose {
101 #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] let md5_pct = (query.md5_coverage() * 100.0) as u32;
103 eprintln!(
104 "Parsed {} contigs from input ({md5_pct}% have MD5)",
105 query.contigs.len(),
106 );
107 }
108
109 if args.hierarchical {
111 run_hierarchical(&args, &query, format, verbose)
112 } else {
113 run_flat(&args, &query, format, verbose)
114 }
115}
116
117fn run_flat(
118 args: &IdentifyArgs,
119 query: &QueryHeader,
120 format: OutputFormat,
121 verbose: bool,
122) -> anyhow::Result<()> {
123 let catalog = if let Some(path) = &args.catalog {
125 ReferenceCatalog::load_from_file(path)?
126 } else {
127 ReferenceCatalog::load_embedded()?
128 };
129
130 if verbose {
131 eprintln!("Loaded flat catalog with {} references", catalog.len());
132 }
133
134 if catalog.is_empty() {
135 eprintln!("Warning: Catalog is empty, no references to match against.");
136 return Ok(());
137 }
138
139 let scoring_weights = ScoringWeights {
141 contig_match: f64::from(args.weight_match) / 100.0,
142 coverage: f64::from(args.weight_coverage) / 100.0,
143 order: f64::from(args.weight_order) / 100.0,
144 conflict_penalty: 0.1, };
146
147 if verbose {
148 eprintln!(
149 "Scoring weights: {:.0}% match, {:.0}% coverage, {:.0}% order",
150 scoring_weights.contig_match * 100.0,
151 scoring_weights.coverage * 100.0,
152 scoring_weights.order * 100.0,
153 );
154 }
155
156 let config = MatchingConfig {
158 min_score: 0.1,
159 scoring_weights: scoring_weights.clone(),
160 };
161 let engine = MatchingEngine::new(&catalog, config);
162 let matches = engine.find_matches(query, args.max_matches);
163
164 if matches.is_empty() {
165 eprintln!("No matching references found.");
166 return Ok(());
167 }
168
169 match format {
171 OutputFormat::Text => {
172 print_text_results(
173 &matches,
174 query,
175 verbose,
176 args.missing_contig_handling,
177 &scoring_weights,
178 );
179 }
180 OutputFormat::Json => {
181 print_json_results(&matches, args.missing_contig_handling, &scoring_weights)?;
182 }
183 OutputFormat::Tsv => print_tsv_results(&matches, &scoring_weights),
184 }
185
186 Ok(())
187}
188
189fn run_hierarchical(
190 args: &IdentifyArgs,
191 query: &QueryHeader,
192 format: OutputFormat,
193 verbose: bool,
194) -> anyhow::Result<()> {
195 let catalog_path = args
197 .catalog
198 .as_ref()
199 .ok_or_else(|| anyhow::anyhow!("--catalog is required when using --hierarchical"))?;
200
201 let catalog = HierarchicalCatalog::load(catalog_path)?;
202
203 if verbose {
204 eprintln!(
205 "Loaded hierarchical catalog v{} with {} assemblies",
206 catalog.version,
207 catalog.assemblies.len()
208 );
209 }
210
211 let engine = HierarchicalMatchingEngine::new(&catalog);
213 let matches = engine.find_matches(query, args.max_matches);
214
215 if matches.is_empty() {
216 eprintln!("No matching references found.");
217 return Ok(());
218 }
219
220 match format {
222 OutputFormat::Text => print_hierarchical_text_results(&matches, query, verbose),
223 OutputFormat::Json => print_hierarchical_json_results(&matches)?,
224 OutputFormat::Tsv => print_hierarchical_tsv_results(&matches),
225 }
226
227 Ok(())
228}
229
230fn parse_input(args: &IdentifyArgs) -> anyhow::Result<QueryHeader> {
231 use std::io::{self, Read};
232
233 if args.input.to_string_lossy() == "-" {
235 let mut buffer = String::new();
236 io::stdin().read_to_string(&mut buffer)?;
237 return Ok(parsing::sam::parse_header_text(&buffer)?);
238 }
239
240 let format = args
242 .input_format
243 .unwrap_or_else(|| detect_format(&args.input));
244
245 match format {
246 InputFormat::Sam | InputFormat::Bam | InputFormat::Cram => {
247 Ok(parsing::sam::parse_file(&args.input)?)
248 }
249 InputFormat::Dict => Ok(parsing::dict::parse_dict_file(&args.input)?),
250 InputFormat::Fai => Ok(parsing::fai::parse_fai_file(&args.input)?),
251 InputFormat::Fasta => Ok(parsing::fasta::parse_fasta_file(&args.input)?),
252 InputFormat::Vcf => Ok(parsing::vcf::parse_vcf_file(&args.input)?),
253 InputFormat::Tsv => Ok(parsing::tsv::parse_tsv_file(&args.input, '\t')?),
254 InputFormat::Csv => Ok(parsing::tsv::parse_tsv_file(&args.input, ',')?),
255 }
256}
257
258fn detect_format(path: &Path) -> InputFormat {
260 let path_str = path.to_string_lossy().to_lowercase();
261
262 if parsing::fasta::is_fasta_file(path) {
264 return InputFormat::Fasta;
265 }
266
267 if path_str.ends_with(".vcf.gz") || path_str.ends_with(".vcf.bgz") {
269 return InputFormat::Vcf;
270 }
271
272 let ext = path
274 .extension()
275 .and_then(|e| e.to_str())
276 .map(str::to_lowercase);
277
278 match ext.as_deref() {
279 Some("bam") => InputFormat::Bam,
280 Some("cram") => InputFormat::Cram,
281 Some("dict") => InputFormat::Dict,
282 Some("fai") => InputFormat::Fai,
283 Some("vcf") => InputFormat::Vcf,
284 Some("tsv") => InputFormat::Tsv,
285 Some("csv") => InputFormat::Csv,
286 _ => InputFormat::Sam, }
288}
289
290#[allow(clippy::too_many_lines)] fn print_text_results(
292 matches: &[MatchResult],
293 query: &QueryHeader,
294 verbose: bool,
295 missing_handling: MissingContigHandling,
296 weights: &ScoringWeights,
297) {
298 for (i, result) in matches.iter().enumerate() {
299 if i > 0 {
300 println!("\n{}", "─".repeat(60));
301 }
302
303 let confidence_str = match result.score.confidence {
305 Confidence::Exact => "EXACT",
306 Confidence::High => "HIGH",
307 Confidence::Medium => "MEDIUM",
308 Confidence::Low => "LOW",
309 };
310
311 println!(
312 "\n#{} {} ({})",
313 i + 1,
314 result.reference.display_name,
315 confidence_str
316 );
317 println!(" ID: {}", result.reference.id);
318 println!(" Assembly: {}", result.reference.assembly);
319 println!(" Source: {}", result.reference.source);
320 println!(" Match Type: {:?}", result.diagnosis.match_type);
321
322 let norm = weights.normalized();
325 println!(
326 "\n Score: {:.1}% = {:.0}%×match + {:.0}%×coverage + {:.0}%×order",
327 result.score.composite * 100.0,
328 result.score.match_quality * 100.0,
329 result.score.coverage_score * 100.0,
330 result.score.order_score * 100.0,
331 );
332 println!(
333 " (weights: {:.0}% match, {:.0}% coverage, {:.0}% order)",
334 norm.contig_match * 100.0,
335 norm.coverage * 100.0,
336 norm.order * 100.0,
337 );
338
339 if !result.reference.contigs_missing_from_fasta.is_empty() {
341 let missing_set: std::collections::HashSet<String> = result
343 .reference
344 .contigs_missing_from_fasta
345 .iter()
346 .map(|s| s.to_lowercase())
347 .collect();
348
349 let query_has_missing: Vec<&str> = query
350 .contigs
351 .iter()
352 .filter(|c| missing_set.contains(&c.name.to_lowercase()))
353 .map(|c| c.name.as_str())
354 .collect();
355
356 match missing_handling {
357 MissingContigHandling::Silent => {}
358 MissingContigHandling::Warn => {
359 if !query_has_missing.is_empty() {
360 println!(
361 "\n Warning: Query has contig(s) not in reference FASTA: {}",
362 query_has_missing.join(", ")
363 );
364 println!(
365 " Note: {} uses external sequence(s) for: {}",
366 result.reference.display_name,
367 result.reference.contigs_missing_from_fasta.join(", ")
368 );
369 }
370 }
371 MissingContigHandling::Strict => {
372 if !query_has_missing.is_empty() {
373 println!(
374 "\n ERROR: Query has contig(s) not in reference FASTA: {}",
375 query_has_missing.join(", ")
376 );
377 println!(
378 " The reference {} does not include: {}",
379 result.reference.display_name,
380 result.reference.contigs_missing_from_fasta.join(", ")
381 );
382 }
383 }
384 }
385 }
386
387 let total_query = query.contigs.len();
389 let exact = result.score.exact_matches;
390 let name_len = result.score.name_length_matches;
391 let conflicts = result.score.md5_conflicts;
392 let unmatched = result.score.unmatched;
393
394 println!(
395 "\n Query contigs: {total_query} total → {exact} exact, {name_len} name+length, {conflicts} conflicts, {unmatched} unmatched"
396 );
397
398 let total_ref = result.reference.contigs.len();
400 let matched_ref = exact + name_len; let uncovered_ref = total_ref.saturating_sub(matched_ref);
402 println!(
403 " Reference contigs: {total_ref} total, {matched_ref} matched, {uncovered_ref} not in query"
404 );
405
406 if result.diagnosis.reordered {
407 println!(" Order: DIFFERENT from reference");
408 }
409
410 if !result.diagnosis.conflicts.is_empty() {
412 println!("\n Conflicts:");
413 for conflict in &result.diagnosis.conflicts {
414 println!(" - {}", conflict.description);
415 }
416 }
417
418 if !result.diagnosis.suggestions.is_empty() {
420 println!("\n Suggestions:");
421 for suggestion in &result.diagnosis.suggestions {
422 match suggestion {
423 Suggestion::RenameContigs { command_hint, .. } => {
424 println!(" - Rename contigs:");
425 for line in command_hint.lines() {
426 println!(" {line}");
427 }
428 }
429 Suggestion::ReorderContigs { command_hint } => {
430 println!(" - Reorder contigs:");
431 for line in command_hint.lines() {
432 println!(" {line}");
433 }
434 }
435 Suggestion::ReplaceContig {
436 contig_name,
437 reason,
438 ..
439 } => {
440 println!(" - Replace {contig_name}: {reason}");
441 }
442 Suggestion::UseAsIs { warnings } => {
443 if warnings.is_empty() {
444 println!(" - Safe to use as-is");
445 } else {
446 println!(" - Safe to use with warnings:");
447 for w in warnings {
448 println!(" - {w}");
449 }
450 }
451 }
452 Suggestion::Realign {
453 reason,
454 suggested_reference,
455 } => {
456 println!(" - Realignment needed: {reason}");
457 println!(" Suggested reference: {suggested_reference}");
458 }
459 }
460 }
461 }
462
463 if let Some(url) = &result.reference.download_url {
465 println!("\n Download: {url}");
466 }
467
468 if verbose && !result.diagnosis.renamed_matches.is_empty() {
470 println!("\n Rename mappings:");
471 for r in &result.diagnosis.renamed_matches {
472 println!(" {} -> {}", r.query_name, r.reference_name);
473 }
474 }
475 }
476
477 println!();
478}
479
480fn print_json_results(
481 matches: &[MatchResult],
482 missing_handling: MissingContigHandling,
483 weights: &ScoringWeights,
484) -> anyhow::Result<()> {
485 let norm = weights.normalized();
486 let output: Vec<serde_json::Value> = matches
488 .iter()
489 .map(|m| {
490 let ref_total = m.reference.contigs.len();
492 let ref_matched = m.score.exact_matches + m.score.name_length_matches;
493 let ref_uncovered = ref_total.saturating_sub(ref_matched);
494
495 let mut json = serde_json::json!({
496 "reference": {
497 "id": m.reference.id.0,
498 "display_name": m.reference.display_name,
499 "assembly": format!("{}", m.reference.assembly),
500 "source": format!("{}", m.reference.source),
501 "download_url": m.reference.download_url,
502 "total_contigs": ref_total,
503 },
504 "score": {
505 "composite": m.score.composite,
506 "confidence": format!("{:?}", m.score.confidence),
507 "match_quality": m.score.match_quality,
509 "coverage_score": m.score.coverage_score,
510 "order_score": m.score.order_score,
511 "weights": {
513 "match": norm.contig_match,
514 "coverage": norm.coverage,
515 "order": norm.order,
516 },
517 },
518 "query_contigs": {
519 "exact_matches": m.score.exact_matches,
520 "name_length_matches": m.score.name_length_matches,
521 "md5_conflicts": m.score.md5_conflicts,
522 "unmatched": m.score.unmatched,
523 },
524 "reference_coverage": {
525 "total": ref_total,
526 "matched": ref_matched,
527 "not_in_query": ref_uncovered,
528 },
529 "match_type": format!("{:?}", m.diagnosis.match_type),
530 "reordered": m.diagnosis.reordered,
531 });
532
533 if !matches!(missing_handling, MissingContigHandling::Silent)
535 && !m.reference.contigs_missing_from_fasta.is_empty()
536 {
537 json["reference"]["contigs_missing_from_fasta"] =
538 serde_json::json!(&m.reference.contigs_missing_from_fasta);
539 }
540
541 json
542 })
543 .collect();
544
545 println!("{}", serde_json::to_string_pretty(&output)?);
546 Ok(())
547}
548
549fn print_tsv_results(matches: &[MatchResult], weights: &ScoringWeights) {
550 let norm = weights.normalized();
551 println!(
553 "rank\tid\tdisplay_name\tassembly\tsource\tmatch_type\tscore\tmatch_score\tcoverage_score\torder_score\tweight_match\tweight_coverage\tweight_order\tconfidence\texact\tname_length\tconflicts\tunmatched\tref_total\tref_matched\tref_uncovered"
554 );
555 for (i, m) in matches.iter().enumerate() {
556 let ref_total = m.reference.contigs.len();
557 let ref_matched = m.score.exact_matches + m.score.name_length_matches;
558 let ref_uncovered = ref_total.saturating_sub(ref_matched);
559
560 println!(
561 "{}\t{}\t{}\t{}\t{}\t{:?}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.2}\t{:.2}\t{:.2}\t{:?}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
562 i + 1,
563 m.reference.id,
564 m.reference.display_name,
565 m.reference.assembly,
566 m.reference.source,
567 m.diagnosis.match_type,
568 m.score.composite,
569 m.score.match_quality,
570 m.score.coverage_score,
571 m.score.order_score,
572 norm.contig_match,
573 norm.coverage,
574 norm.order,
575 m.score.confidence,
576 m.score.exact_matches,
577 m.score.name_length_matches,
578 m.score.md5_conflicts,
579 m.score.unmatched,
580 ref_total,
581 ref_matched,
582 ref_uncovered,
583 );
584 }
585}
586
587fn print_hierarchical_text_results(
592 matches: &[HierarchicalMatchResult],
593 query: &QueryHeader,
594 verbose: bool,
595) {
596 for (i, result) in matches.iter().enumerate() {
597 if i > 0 {
598 println!("\n{}", "─".repeat(60));
599 }
600
601 let match_str = format!("{:?}", result.match_type).to_uppercase();
603 println!("\n#{} {} ({})", i + 1, result.display_name, match_str);
604
605 println!(" Distribution ID: {}", result.distribution_id);
607
608 if !result.assembly_id.is_empty() {
610 println!(
611 " Assembly: {} ({})",
612 result.assembly_name, result.assembly_id
613 );
614 if !result.version_string.is_empty() {
615 println!(
616 " Version: {} ({})",
617 result.version_string, result.version_id
618 );
619 }
620 }
621
622 println!(" Score: {:.1}%", result.match_percentage());
624
625 println!("\n Contig Summary:");
627 println!(" - Your file: {} contigs", result.total_query_contigs);
628 println!(
629 " - This distribution: {} contigs",
630 result.total_distribution_contigs
631 );
632 println!(" - Matched: {} contigs", result.matched_contigs);
633
634 if result.extra_in_query > 0 {
635 println!(" - Extra in your file: {}", result.extra_in_query);
636 }
637 if result.missing_from_query > 0 {
638 println!(" - Missing from your file: {}", result.missing_from_query);
639 }
640
641 let counts = &result.presence_counts;
643 if counts.in_both > 0 || counts.fasta_only > 0 || counts.report_only > 0 {
644 println!("\n Presence Breakdown:");
645 if counts.in_both > 0 {
646 println!(" - In both (FASTA + report): {} contigs", counts.in_both);
647 }
648 if counts.fasta_only > 0 {
649 println!(" - FASTA-only (decoy/HLA): {} contigs", counts.fasta_only);
650 }
651 if counts.report_only > 0 {
652 println!(
653 " - Report-only (not in FASTA): {} contigs",
654 counts.report_only
655 );
656 }
657 }
658
659 if verbose {
661 println!("\n Query contigs: {}", query.contigs.len());
662 let md5_count = query.contigs.iter().filter(|c| c.md5.is_some()).count();
663 println!(" Query contigs with MD5: {md5_count}");
664 }
665 }
666
667 println!();
668}
669
670fn print_hierarchical_json_results(matches: &[HierarchicalMatchResult]) -> anyhow::Result<()> {
671 let output: Vec<serde_json::Value> = matches
672 .iter()
673 .map(|m| {
674 serde_json::json!({
675 "distribution": {
676 "id": m.distribution_id,
677 "display_name": m.display_name,
678 },
679 "assembly": {
680 "id": m.assembly_id,
681 "name": m.assembly_name,
682 "version_id": m.version_id,
683 "version": m.version_string,
684 },
685 "match_type": format!("{:?}", m.match_type),
686 "score": m.score,
687 "matched_contigs": m.matched_contigs,
688 "total_query_contigs": m.total_query_contigs,
689 "total_distribution_contigs": m.total_distribution_contigs,
690 "extra_in_query": m.extra_in_query,
691 "missing_from_query": m.missing_from_query,
692 "presence_counts": {
693 "in_both": m.presence_counts.in_both,
694 "fasta_only": m.presence_counts.fasta_only,
695 "report_only": m.presence_counts.report_only,
696 },
697 })
698 })
699 .collect();
700
701 println!("{}", serde_json::to_string_pretty(&output)?);
702 Ok(())
703}
704
705fn print_hierarchical_tsv_results(matches: &[HierarchicalMatchResult]) {
706 println!("rank\tdistribution_id\tdisplay_name\tassembly_id\tversion_id\tmatch_type\tscore\tmatched\tquery_total\tdist_total\tin_both\tfasta_only");
707 for (i, m) in matches.iter().enumerate() {
708 println!(
709 "{}\t{}\t{}\t{}\t{}\t{:?}\t{:.4}\t{}\t{}\t{}\t{}\t{}",
710 i + 1,
711 m.distribution_id,
712 m.display_name,
713 m.assembly_id,
714 m.version_id,
715 m.match_type,
716 m.score,
717 m.matched_contigs,
718 m.total_query_contigs,
719 m.total_distribution_contigs,
720 m.presence_counts.in_both,
721 m.presence_counts.fasta_only,
722 );
723 }
724}