Skip to main content

ref_solver/cli/
compare.rs

1use std::path::{Path, PathBuf};
2
3use clap::Args;
4
5use crate::catalog::store::ReferenceCatalog;
6use crate::cli::OutputFormat;
7use crate::core::header::QueryHeader;
8use crate::core::reference::KnownReference;
9use crate::core::types::{Assembly, ReferenceSource};
10use crate::matching::scoring::MatchScore;
11use crate::parsing;
12
13#[derive(Args)]
14pub struct CompareArgs {
15    /// First input file (BAM, SAM, CRAM, .dict, or TSV)
16    #[arg(required = true)]
17    pub input_a: PathBuf,
18
19    /// Second input file, or reference ID from catalog
20    #[arg(required = true)]
21    pub input_b: String,
22
23    /// Treat second argument as a reference ID from the catalog
24    #[arg(long)]
25    pub reference: bool,
26
27    /// Path to custom catalog file (only used with --reference)
28    #[arg(long)]
29    pub catalog: Option<PathBuf>,
30}
31
32/// Execute compare subcommand
33///
34/// # Errors
35///
36/// Returns an error if inputs cannot be parsed or comparison fails.
37#[allow(clippy::needless_pass_by_value)] // CLI entry point, values from clap
38pub fn run(args: CompareArgs, format: OutputFormat, verbose: bool) -> anyhow::Result<()> {
39    // Parse first input
40    let query_a = parse_input(&args.input_a)?;
41
42    if verbose {
43        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] // Percentage 0-100
44        let md5_pct = (query_a.md5_coverage() * 100.0) as u32;
45        eprintln!(
46            "Input A: {} contigs ({md5_pct}% have MD5)",
47            query_a.contigs.len(),
48        );
49    }
50
51    // Parse second input
52    let query_b = if args.reference {
53        // Look up in catalog
54        let catalog = if let Some(path) = &args.catalog {
55            ReferenceCatalog::load_from_file(path)?
56        } else {
57            ReferenceCatalog::load_embedded()?
58        };
59
60        let ref_id = crate::core::types::ReferenceId::new(&args.input_b);
61        let reference = catalog
62            .get(&ref_id)
63            .ok_or_else(|| anyhow::anyhow!("Reference '{}' not found in catalog", args.input_b))?;
64
65        // Convert reference to query header for comparison
66        QueryHeader::new(reference.contigs.clone())
67    } else {
68        // Parse as file
69        let path = PathBuf::from(&args.input_b);
70        parse_input(&path)?
71    };
72
73    if verbose {
74        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] // Percentage 0-100
75        let md5_pct = (query_b.md5_coverage() * 100.0) as u32;
76        eprintln!(
77            "Input B: {} contigs ({md5_pct}% have MD5)",
78            query_b.contigs.len(),
79        );
80    }
81
82    // Compare - use builder pattern to ensure indexes are properly computed
83    let reference_b = KnownReference::new(
84        "input_b",
85        &args.input_b,
86        Assembly::Other("unknown".to_string()),
87        ReferenceSource::Custom("input".to_string()),
88    )
89    .with_contigs(query_b.contigs.clone());
90
91    let score = MatchScore::calculate(&query_a, &reference_b);
92
93    // Output results
94    match format {
95        OutputFormat::Text => print_text_comparison(&args, &query_a, &query_b, &score),
96        OutputFormat::Json => print_json_comparison(&args, &query_a, &query_b, &score)?,
97        OutputFormat::Tsv => print_tsv_comparison(&score),
98    }
99
100    Ok(())
101}
102
103fn parse_input(path: &Path) -> anyhow::Result<QueryHeader> {
104    let ext = path
105        .extension()
106        .and_then(|e| e.to_str())
107        .map(str::to_lowercase);
108
109    match ext.as_deref() {
110        Some("dict") => Ok(parsing::dict::parse_dict_file(path)?),
111        Some("tsv") => Ok(parsing::tsv::parse_tsv_file(path, '\t')?),
112        Some("csv") => Ok(parsing::tsv::parse_tsv_file(path, ',')?),
113        // Default to SAM/BAM/CRAM parsing for bam, sam, cram, and unknown extensions
114        _ => Ok(parsing::sam::parse_file(path)?),
115    }
116}
117
118fn print_text_comparison(
119    args: &CompareArgs,
120    query_a: &QueryHeader,
121    query_b: &QueryHeader,
122    score: &MatchScore,
123) {
124    println!("Comparison Results");
125    println!("{}", "=".repeat(60));
126
127    println!("\nInput A: {}", args.input_a.display());
128    println!("  Contigs: {}", query_a.contigs.len());
129    println!("  MD5 coverage: {:.0}%", query_a.md5_coverage() * 100.0);
130    println!("  Naming convention: {:?}", query_a.naming_convention);
131
132    println!("\nInput B: {}", args.input_b);
133    println!("  Contigs: {}", query_b.contigs.len());
134    println!("  MD5 coverage: {:.0}%", query_b.md5_coverage() * 100.0);
135    println!("  Naming convention: {:?}", query_b.naming_convention);
136
137    println!("\nSimilarity Scores:");
138    println!("  MD5 Jaccard: {:.2}%", score.md5_jaccard * 100.0);
139    println!(
140        "  Name+Length Jaccard: {:.2}%",
141        score.name_length_jaccard * 100.0
142    );
143    println!(
144        "  MD5 Query Coverage: {:.2}%",
145        score.md5_query_coverage * 100.0
146    );
147    println!(
148        "  Name+Length Query Coverage: {:.2}%",
149        score.name_length_query_coverage * 100.0
150    );
151    println!("  Order Preserved: {}", score.order_preserved);
152    println!("  Order Score: {:.2}%", score.order_score * 100.0);
153    println!("  Composite Score: {:.2}%", score.composite * 100.0);
154    println!("  Confidence: {:?}", score.confidence);
155}
156
157fn print_json_comparison(
158    args: &CompareArgs,
159    query_a: &QueryHeader,
160    query_b: &QueryHeader,
161    score: &MatchScore,
162) -> anyhow::Result<()> {
163    let output = serde_json::json!({
164        "input_a": {
165            "path": args.input_a.display().to_string(),
166            "contig_count": query_a.contigs.len(),
167            "md5_coverage": query_a.md5_coverage(),
168            "naming_convention": format!("{:?}", query_a.naming_convention),
169        },
170        "input_b": {
171            "path": args.input_b,
172            "contig_count": query_b.contigs.len(),
173            "md5_coverage": query_b.md5_coverage(),
174            "naming_convention": format!("{:?}", query_b.naming_convention),
175        },
176        "score": {
177            "md5_jaccard": score.md5_jaccard,
178            "name_length_jaccard": score.name_length_jaccard,
179            "md5_query_coverage": score.md5_query_coverage,
180            "name_length_query_coverage": score.name_length_query_coverage,
181            "order_preserved": score.order_preserved,
182            "order_score": score.order_score,
183            "composite": score.composite,
184            "confidence": format!("{:?}", score.confidence),
185        }
186    });
187
188    println!("{}", serde_json::to_string_pretty(&output)?);
189    Ok(())
190}
191
192fn print_tsv_comparison(score: &MatchScore) {
193    println!(
194        "md5_jaccard\tname_length_jaccard\tmd5_query_coverage\torder_preserved\tcomposite\tconfidence"
195    );
196    println!(
197        "{:.4}\t{:.4}\t{:.4}\t{}\t{:.4}\t{:?}",
198        score.md5_jaccard,
199        score.name_length_jaccard,
200        score.md5_query_coverage,
201        score.order_preserved,
202        score.composite,
203        score.confidence,
204    );
205}