Skip to main content

ref_solver/cli/
identify.rs

1use std::path::{Path, PathBuf};
2
3use clap::Args;
4
5use crate::catalog::hierarchical::HierarchicalCatalog;
6use crate::catalog::store::ReferenceCatalog;
7use crate::cli::OutputFormat;
8use crate::core::header::QueryHeader;
9use crate::core::types::Confidence;
10use crate::matching::engine::{MatchResult, MatchingConfig, MatchingEngine, ScoringWeights};
11use crate::matching::hierarchical_engine::{HierarchicalMatchResult, HierarchicalMatchingEngine};
12use crate::matching::Suggestion;
13use crate::parsing;
14
15/// How to handle references that have contigs missing from their FASTA
16/// (e.g., CHM13 where MT is in assembly report but uses standard rCRS mitochondria)
17#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)]
18pub enum MissingContigHandling {
19    /// Show warnings when query has contigs that match a reference's missing contigs
20    #[default]
21    Warn,
22    /// Treat missing contigs as errors that lower match confidence
23    Strict,
24    /// Don't mention missing contigs at all
25    Silent,
26}
27
28#[derive(Args)]
29pub struct IdentifyArgs {
30    /// Input file (BAM, SAM, CRAM, FASTA, FAI, VCF, .dict, TSV, or CSV)
31    /// Use '-' for stdin (expects header text)
32    #[arg(required = true)]
33    pub input: PathBuf,
34
35    /// Input format (auto-detected by default)
36    #[arg(long)]
37    pub input_format: Option<InputFormat>,
38
39    /// Number of matches to show
40    #[arg(short = 'n', long, default_value = "5")]
41    pub max_matches: usize,
42
43    /// Only show exact or near-exact matches
44    #[arg(long)]
45    pub exact_only: bool,
46
47    /// Path to custom catalog file
48    #[arg(long)]
49    pub catalog: Option<PathBuf>,
50
51    /// Use hierarchical catalog format (required when --catalog points to a hierarchical catalog)
52    #[arg(long)]
53    pub hierarchical: bool,
54
55    /// How to handle references with contigs missing from FASTA
56    /// (e.g., CHM13 MT which uses standard rCRS mitochondria)
57    #[arg(long, value_enum, default_value = "warn")]
58    pub missing_contig_handling: MissingContigHandling,
59
60    // === Scoring weight options ===
61    /// Weight for contig match score (0-100, default 70)
62    /// How well query contigs match reference contigs
63    #[arg(long, default_value = "70", value_parser = clap::value_parser!(u32).range(0..=100))]
64    pub weight_match: u32,
65
66    /// Weight for coverage score (0-100, default 20)
67    /// What fraction of reference contigs are covered by query
68    #[arg(long, default_value = "20", value_parser = clap::value_parser!(u32).range(0..=100))]
69    pub weight_coverage: u32,
70
71    /// Weight for order score (0-100, default 10)
72    /// Whether contigs appear in the same order
73    #[arg(long, default_value = "10", value_parser = clap::value_parser!(u32).range(0..=100))]
74    pub weight_order: u32,
75}
76
77#[derive(Clone, Copy, Debug, clap::ValueEnum)]
78pub enum InputFormat {
79    Sam,
80    Bam,
81    Cram,
82    Dict,
83    Fai,
84    Fasta,
85    Vcf,
86    Tsv,
87    Csv,
88}
89
90/// Execute identify subcommand
91///
92/// # Errors
93///
94/// Returns an error if the input cannot be parsed or identification fails.
95#[allow(clippy::needless_pass_by_value)] // CLI entry point, values from clap
96pub fn run(args: IdentifyArgs, format: OutputFormat, verbose: bool) -> anyhow::Result<()> {
97    // Parse input first (needed for both catalog types)
98    let query = parse_input(&args)?;
99
100    if verbose {
101        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] // Percentage 0-100
102        let md5_pct = (query.md5_coverage() * 100.0) as u32;
103        eprintln!(
104            "Parsed {} contigs from input ({md5_pct}% have MD5)",
105            query.contigs.len(),
106        );
107    }
108
109    // Use hierarchical or flat catalog based on flag
110    if args.hierarchical {
111        run_hierarchical(&args, &query, format, verbose)
112    } else {
113        run_flat(&args, &query, format, verbose)
114    }
115}
116
117fn run_flat(
118    args: &IdentifyArgs,
119    query: &QueryHeader,
120    format: OutputFormat,
121    verbose: bool,
122) -> anyhow::Result<()> {
123    // Load flat catalog
124    let catalog = if let Some(path) = &args.catalog {
125        ReferenceCatalog::load_from_file(path)?
126    } else {
127        ReferenceCatalog::load_embedded()?
128    };
129
130    if verbose {
131        eprintln!("Loaded flat catalog with {} references", catalog.len());
132    }
133
134    if catalog.is_empty() {
135        eprintln!("Warning: Catalog is empty, no references to match against.");
136        return Ok(());
137    }
138
139    // Build scoring weights from command line args
140    let scoring_weights = ScoringWeights {
141        contig_match: f64::from(args.weight_match) / 100.0,
142        coverage: f64::from(args.weight_coverage) / 100.0,
143        order: f64::from(args.weight_order) / 100.0,
144        conflict_penalty: 0.1, // Default: 10% credit for MD5 conflicts
145    };
146
147    if verbose {
148        eprintln!(
149            "Scoring weights: {:.0}% match, {:.0}% coverage, {:.0}% order",
150            scoring_weights.contig_match * 100.0,
151            scoring_weights.coverage * 100.0,
152            scoring_weights.order * 100.0,
153        );
154    }
155
156    // Find matches with custom config
157    let config = MatchingConfig {
158        min_score: 0.1,
159        scoring_weights: scoring_weights.clone(),
160    };
161    let engine = MatchingEngine::new(&catalog, config);
162    let matches = engine.find_matches(query, args.max_matches);
163
164    if matches.is_empty() {
165        eprintln!("No matching references found.");
166        return Ok(());
167    }
168
169    // Output results
170    match format {
171        OutputFormat::Text => {
172            print_text_results(
173                &matches,
174                query,
175                verbose,
176                args.missing_contig_handling,
177                &scoring_weights,
178            );
179        }
180        OutputFormat::Json => {
181            print_json_results(&matches, args.missing_contig_handling, &scoring_weights)?;
182        }
183        OutputFormat::Tsv => print_tsv_results(&matches, &scoring_weights),
184    }
185
186    Ok(())
187}
188
189fn run_hierarchical(
190    args: &IdentifyArgs,
191    query: &QueryHeader,
192    format: OutputFormat,
193    verbose: bool,
194) -> anyhow::Result<()> {
195    // Load hierarchical catalog
196    let catalog_path = args
197        .catalog
198        .as_ref()
199        .ok_or_else(|| anyhow::anyhow!("--catalog is required when using --hierarchical"))?;
200
201    let catalog = HierarchicalCatalog::load(catalog_path)?;
202
203    if verbose {
204        eprintln!(
205            "Loaded hierarchical catalog v{} with {} assemblies",
206            catalog.version,
207            catalog.assemblies.len()
208        );
209    }
210
211    // Find matches
212    let engine = HierarchicalMatchingEngine::new(&catalog);
213    let matches = engine.find_matches(query, args.max_matches);
214
215    if matches.is_empty() {
216        eprintln!("No matching references found.");
217        return Ok(());
218    }
219
220    // Output results
221    match format {
222        OutputFormat::Text => print_hierarchical_text_results(&matches, query, verbose),
223        OutputFormat::Json => print_hierarchical_json_results(&matches)?,
224        OutputFormat::Tsv => print_hierarchical_tsv_results(&matches),
225    }
226
227    Ok(())
228}
229
230fn parse_input(args: &IdentifyArgs) -> anyhow::Result<QueryHeader> {
231    use std::io::{self, Read};
232
233    // Handle stdin
234    if args.input.to_string_lossy() == "-" {
235        let mut buffer = String::new();
236        io::stdin().read_to_string(&mut buffer)?;
237        return Ok(parsing::sam::parse_header_text(&buffer)?);
238    }
239
240    // Auto-detect or use specified format
241    let format = args
242        .input_format
243        .unwrap_or_else(|| detect_format(&args.input));
244
245    match format {
246        InputFormat::Sam | InputFormat::Bam | InputFormat::Cram => {
247            Ok(parsing::sam::parse_file(&args.input)?)
248        }
249        InputFormat::Dict => Ok(parsing::dict::parse_dict_file(&args.input)?),
250        InputFormat::Fai => Ok(parsing::fai::parse_fai_file(&args.input)?),
251        InputFormat::Fasta => Ok(parsing::fasta::parse_fasta_file(&args.input)?),
252        InputFormat::Vcf => Ok(parsing::vcf::parse_vcf_file(&args.input)?),
253        InputFormat::Tsv => Ok(parsing::tsv::parse_tsv_file(&args.input, '\t')?),
254        InputFormat::Csv => Ok(parsing::tsv::parse_tsv_file(&args.input, ',')?),
255    }
256}
257
258/// Detect input format from file extension
259fn detect_format(path: &Path) -> InputFormat {
260    let path_str = path.to_string_lossy().to_lowercase();
261
262    // Check for FASTA files (including gzipped)
263    if parsing::fasta::is_fasta_file(path) {
264        return InputFormat::Fasta;
265    }
266
267    // Check for gzipped VCF
268    if path_str.ends_with(".vcf.gz") || path_str.ends_with(".vcf.bgz") {
269        return InputFormat::Vcf;
270    }
271
272    // Get the extension for simple cases
273    let ext = path
274        .extension()
275        .and_then(|e| e.to_str())
276        .map(str::to_lowercase);
277
278    match ext.as_deref() {
279        Some("bam") => InputFormat::Bam,
280        Some("cram") => InputFormat::Cram,
281        Some("dict") => InputFormat::Dict,
282        Some("fai") => InputFormat::Fai,
283        Some("vcf") => InputFormat::Vcf,
284        Some("tsv") => InputFormat::Tsv,
285        Some("csv") => InputFormat::Csv,
286        _ => InputFormat::Sam, // Default to SAM for unknown extensions
287    }
288}
289
290#[allow(clippy::too_many_lines)] // TODO: Refactor into smaller functions
291fn print_text_results(
292    matches: &[MatchResult],
293    query: &QueryHeader,
294    verbose: bool,
295    missing_handling: MissingContigHandling,
296    weights: &ScoringWeights,
297) {
298    for (i, result) in matches.iter().enumerate() {
299        if i > 0 {
300            println!("\n{}", "─".repeat(60));
301        }
302
303        // Header
304        let confidence_str = match result.score.confidence {
305            Confidence::Exact => "EXACT",
306            Confidence::High => "HIGH",
307            Confidence::Medium => "MEDIUM",
308            Confidence::Low => "LOW",
309        };
310
311        println!(
312            "\n#{} {} ({})",
313            i + 1,
314            result.reference.display_name,
315            confidence_str
316        );
317        println!("   ID: {}", result.reference.id);
318        println!("   Assembly: {}", result.reference.assembly);
319        println!("   Source: {}", result.reference.source);
320        println!("   Match Type: {:?}", result.diagnosis.match_type);
321
322        // Score breakdown: show component scores and final composite
323        // Normalize weights for display
324        let norm = weights.normalized();
325        println!(
326            "\n   Score: {:.1}% = {:.0}%×match + {:.0}%×coverage + {:.0}%×order",
327            result.score.composite * 100.0,
328            result.score.match_quality * 100.0,
329            result.score.coverage_score * 100.0,
330            result.score.order_score * 100.0,
331        );
332        println!(
333            "          (weights: {:.0}% match, {:.0}% coverage, {:.0}% order)",
334            norm.contig_match * 100.0,
335            norm.coverage * 100.0,
336            norm.order * 100.0,
337        );
338
339        // Check for contigs missing from FASTA
340        if !result.reference.contigs_missing_from_fasta.is_empty() {
341            // Find query contigs that match the missing contigs (by name, case-insensitive)
342            let missing_set: std::collections::HashSet<String> = result
343                .reference
344                .contigs_missing_from_fasta
345                .iter()
346                .map(|s| s.to_lowercase())
347                .collect();
348
349            let query_has_missing: Vec<&str> = query
350                .contigs
351                .iter()
352                .filter(|c| missing_set.contains(&c.name.to_lowercase()))
353                .map(|c| c.name.as_str())
354                .collect();
355
356            match missing_handling {
357                MissingContigHandling::Silent => {}
358                MissingContigHandling::Warn => {
359                    if !query_has_missing.is_empty() {
360                        println!(
361                            "\n   Warning: Query has contig(s) not in reference FASTA: {}",
362                            query_has_missing.join(", ")
363                        );
364                        println!(
365                            "   Note: {} uses external sequence(s) for: {}",
366                            result.reference.display_name,
367                            result.reference.contigs_missing_from_fasta.join(", ")
368                        );
369                    }
370                }
371                MissingContigHandling::Strict => {
372                    if !query_has_missing.is_empty() {
373                        println!(
374                            "\n   ERROR: Query has contig(s) not in reference FASTA: {}",
375                            query_has_missing.join(", ")
376                        );
377                        println!(
378                            "   The reference {} does not include: {}",
379                            result.reference.display_name,
380                            result.reference.contigs_missing_from_fasta.join(", ")
381                        );
382                    }
383                }
384            }
385        }
386
387        // Match details - query contigs
388        let total_query = query.contigs.len();
389        let exact = result.score.exact_matches;
390        let name_len = result.score.name_length_matches;
391        let conflicts = result.score.md5_conflicts;
392        let unmatched = result.score.unmatched;
393
394        println!(
395            "\n   Query contigs: {total_query} total → {exact} exact, {name_len} name+length, {conflicts} conflicts, {unmatched} unmatched"
396        );
397
398        // Reference coverage info
399        let total_ref = result.reference.contigs.len();
400        let matched_ref = exact + name_len; // Good matches that cover reference
401        let uncovered_ref = total_ref.saturating_sub(matched_ref);
402        println!(
403            "   Reference contigs: {total_ref} total, {matched_ref} matched, {uncovered_ref} not in query"
404        );
405
406        if result.diagnosis.reordered {
407            println!("   Order: DIFFERENT from reference");
408        }
409
410        // Conflicts
411        if !result.diagnosis.conflicts.is_empty() {
412            println!("\n   Conflicts:");
413            for conflict in &result.diagnosis.conflicts {
414                println!("   - {}", conflict.description);
415            }
416        }
417
418        // Suggestions
419        if !result.diagnosis.suggestions.is_empty() {
420            println!("\n   Suggestions:");
421            for suggestion in &result.diagnosis.suggestions {
422                match suggestion {
423                    Suggestion::RenameContigs { command_hint, .. } => {
424                        println!("   - Rename contigs:");
425                        for line in command_hint.lines() {
426                            println!("     {line}");
427                        }
428                    }
429                    Suggestion::ReorderContigs { command_hint } => {
430                        println!("   - Reorder contigs:");
431                        for line in command_hint.lines() {
432                            println!("     {line}");
433                        }
434                    }
435                    Suggestion::ReplaceContig {
436                        contig_name,
437                        reason,
438                        ..
439                    } => {
440                        println!("   - Replace {contig_name}: {reason}");
441                    }
442                    Suggestion::UseAsIs { warnings } => {
443                        if warnings.is_empty() {
444                            println!("   - Safe to use as-is");
445                        } else {
446                            println!("   - Safe to use with warnings:");
447                            for w in warnings {
448                                println!("     - {w}");
449                            }
450                        }
451                    }
452                    Suggestion::Realign {
453                        reason,
454                        suggested_reference,
455                    } => {
456                        println!("   - Realignment needed: {reason}");
457                        println!("     Suggested reference: {suggested_reference}");
458                    }
459                }
460            }
461        }
462
463        // Download URL
464        if let Some(url) = &result.reference.download_url {
465            println!("\n   Download: {url}");
466        }
467
468        // Verbose details
469        if verbose && !result.diagnosis.renamed_matches.is_empty() {
470            println!("\n   Rename mappings:");
471            for r in &result.diagnosis.renamed_matches {
472                println!("     {} -> {}", r.query_name, r.reference_name);
473            }
474        }
475    }
476
477    println!();
478}
479
480fn print_json_results(
481    matches: &[MatchResult],
482    missing_handling: MissingContigHandling,
483    weights: &ScoringWeights,
484) -> anyhow::Result<()> {
485    let norm = weights.normalized();
486    // Create serializable output
487    let output: Vec<serde_json::Value> = matches
488        .iter()
489        .map(|m| {
490            // Calculate reference coverage
491            let ref_total = m.reference.contigs.len();
492            let ref_matched = m.score.exact_matches + m.score.name_length_matches;
493            let ref_uncovered = ref_total.saturating_sub(ref_matched);
494
495            let mut json = serde_json::json!({
496                "reference": {
497                    "id": m.reference.id.0,
498                    "display_name": m.reference.display_name,
499                    "assembly": format!("{}", m.reference.assembly),
500                    "source": format!("{}", m.reference.source),
501                    "download_url": m.reference.download_url,
502                    "total_contigs": ref_total,
503                },
504                "score": {
505                    "composite": m.score.composite,
506                    "confidence": format!("{:?}", m.score.confidence),
507                    // Component scores (these make up the composite)
508                    "match_quality": m.score.match_quality,
509                    "coverage_score": m.score.coverage_score,
510                    "order_score": m.score.order_score,
511                    // Weights used
512                    "weights": {
513                        "match": norm.contig_match,
514                        "coverage": norm.coverage,
515                        "order": norm.order,
516                    },
517                },
518                "query_contigs": {
519                    "exact_matches": m.score.exact_matches,
520                    "name_length_matches": m.score.name_length_matches,
521                    "md5_conflicts": m.score.md5_conflicts,
522                    "unmatched": m.score.unmatched,
523                },
524                "reference_coverage": {
525                    "total": ref_total,
526                    "matched": ref_matched,
527                    "not_in_query": ref_uncovered,
528                },
529                "match_type": format!("{:?}", m.diagnosis.match_type),
530                "reordered": m.diagnosis.reordered,
531            });
532
533            // Add missing contig info unless silent
534            if !matches!(missing_handling, MissingContigHandling::Silent)
535                && !m.reference.contigs_missing_from_fasta.is_empty()
536            {
537                json["reference"]["contigs_missing_from_fasta"] =
538                    serde_json::json!(&m.reference.contigs_missing_from_fasta);
539            }
540
541            json
542        })
543        .collect();
544
545    println!("{}", serde_json::to_string_pretty(&output)?);
546    Ok(())
547}
548
549fn print_tsv_results(matches: &[MatchResult], weights: &ScoringWeights) {
550    let norm = weights.normalized();
551    // Header with all fields
552    println!(
553        "rank\tid\tdisplay_name\tassembly\tsource\tmatch_type\tscore\tmatch_score\tcoverage_score\torder_score\tweight_match\tweight_coverage\tweight_order\tconfidence\texact\tname_length\tconflicts\tunmatched\tref_total\tref_matched\tref_uncovered"
554    );
555    for (i, m) in matches.iter().enumerate() {
556        let ref_total = m.reference.contigs.len();
557        let ref_matched = m.score.exact_matches + m.score.name_length_matches;
558        let ref_uncovered = ref_total.saturating_sub(ref_matched);
559
560        println!(
561            "{}\t{}\t{}\t{}\t{}\t{:?}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.2}\t{:.2}\t{:.2}\t{:?}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
562            i + 1,
563            m.reference.id,
564            m.reference.display_name,
565            m.reference.assembly,
566            m.reference.source,
567            m.diagnosis.match_type,
568            m.score.composite,
569            m.score.match_quality,
570            m.score.coverage_score,
571            m.score.order_score,
572            norm.contig_match,
573            norm.coverage,
574            norm.order,
575            m.score.confidence,
576            m.score.exact_matches,
577            m.score.name_length_matches,
578            m.score.md5_conflicts,
579            m.score.unmatched,
580            ref_total,
581            ref_matched,
582            ref_uncovered,
583        );
584    }
585}
586
587// ============================================================================
588// Hierarchical catalog output functions
589// ============================================================================
590
591fn print_hierarchical_text_results(
592    matches: &[HierarchicalMatchResult],
593    query: &QueryHeader,
594    verbose: bool,
595) {
596    for (i, result) in matches.iter().enumerate() {
597        if i > 0 {
598            println!("\n{}", "─".repeat(60));
599        }
600
601        // Header with match type
602        let match_str = format!("{:?}", result.match_type).to_uppercase();
603        println!("\n#{} {} ({})", i + 1, result.display_name, match_str);
604
605        // Distribution info
606        println!("   Distribution ID: {}", result.distribution_id);
607
608        // Assembly info (if available)
609        if !result.assembly_id.is_empty() {
610            println!(
611                "   Assembly: {} ({})",
612                result.assembly_name, result.assembly_id
613            );
614            if !result.version_string.is_empty() {
615                println!(
616                    "   Version: {} ({})",
617                    result.version_string, result.version_id
618                );
619            }
620        }
621
622        // Match score
623        println!("   Score: {:.1}%", result.match_percentage());
624
625        // Contig summary
626        println!("\n   Contig Summary:");
627        println!("   - Your file: {} contigs", result.total_query_contigs);
628        println!(
629            "   - This distribution: {} contigs",
630            result.total_distribution_contigs
631        );
632        println!("   - Matched: {} contigs", result.matched_contigs);
633
634        if result.extra_in_query > 0 {
635            println!("   - Extra in your file: {}", result.extra_in_query);
636        }
637        if result.missing_from_query > 0 {
638            println!("   - Missing from your file: {}", result.missing_from_query);
639        }
640
641        // Presence breakdown (only show if there's assembly linkage)
642        let counts = &result.presence_counts;
643        if counts.in_both > 0 || counts.fasta_only > 0 || counts.report_only > 0 {
644            println!("\n   Presence Breakdown:");
645            if counts.in_both > 0 {
646                println!("   - In both (FASTA + report): {} contigs", counts.in_both);
647            }
648            if counts.fasta_only > 0 {
649                println!("   - FASTA-only (decoy/HLA): {} contigs", counts.fasta_only);
650            }
651            if counts.report_only > 0 {
652                println!(
653                    "   - Report-only (not in FASTA): {} contigs",
654                    counts.report_only
655                );
656            }
657        }
658
659        // Verbose details
660        if verbose {
661            println!("\n   Query contigs: {}", query.contigs.len());
662            let md5_count = query.contigs.iter().filter(|c| c.md5.is_some()).count();
663            println!("   Query contigs with MD5: {md5_count}");
664        }
665    }
666
667    println!();
668}
669
670fn print_hierarchical_json_results(matches: &[HierarchicalMatchResult]) -> anyhow::Result<()> {
671    let output: Vec<serde_json::Value> = matches
672        .iter()
673        .map(|m| {
674            serde_json::json!({
675                "distribution": {
676                    "id": m.distribution_id,
677                    "display_name": m.display_name,
678                },
679                "assembly": {
680                    "id": m.assembly_id,
681                    "name": m.assembly_name,
682                    "version_id": m.version_id,
683                    "version": m.version_string,
684                },
685                "match_type": format!("{:?}", m.match_type),
686                "score": m.score,
687                "matched_contigs": m.matched_contigs,
688                "total_query_contigs": m.total_query_contigs,
689                "total_distribution_contigs": m.total_distribution_contigs,
690                "extra_in_query": m.extra_in_query,
691                "missing_from_query": m.missing_from_query,
692                "presence_counts": {
693                    "in_both": m.presence_counts.in_both,
694                    "fasta_only": m.presence_counts.fasta_only,
695                    "report_only": m.presence_counts.report_only,
696                },
697            })
698        })
699        .collect();
700
701    println!("{}", serde_json::to_string_pretty(&output)?);
702    Ok(())
703}
704
705fn print_hierarchical_tsv_results(matches: &[HierarchicalMatchResult]) {
706    println!("rank\tdistribution_id\tdisplay_name\tassembly_id\tversion_id\tmatch_type\tscore\tmatched\tquery_total\tdist_total\tin_both\tfasta_only");
707    for (i, m) in matches.iter().enumerate() {
708        println!(
709            "{}\t{}\t{}\t{}\t{}\t{:?}\t{:.4}\t{}\t{}\t{}\t{}\t{}",
710            i + 1,
711            m.distribution_id,
712            m.display_name,
713            m.assembly_id,
714            m.version_id,
715            m.match_type,
716            m.score,
717            m.matched_contigs,
718            m.total_query_contigs,
719            m.total_distribution_contigs,
720            m.presence_counts.in_both,
721            m.presence_counts.fasta_only,
722        );
723    }
724}