Skip to main content

ref_solver/matching/
hierarchical_engine.rs

1//! Hierarchical matching engine for the new catalog structure.
2//!
3//! This engine matches queries against the hierarchical catalog which
4//! contains Assembly → `AssemblyVersion` → `FastaDistribution` → `FastaContig`.
5
6use std::collections::HashMap;
7
8use crate::catalog::hierarchical::{CatalogIndex, HierarchicalCatalog};
9use crate::core::assembly::PresenceCounts;
10use crate::core::header::QueryHeader;
11use crate::core::types::MatchType;
12
13/// Helper function to convert usize count to f64 with explicit precision loss allowance
14#[inline]
15fn count_to_f64(count: usize) -> f64 {
16    #[allow(clippy::cast_precision_loss)]
17    {
18        count as f64
19    }
20}
21
22/// Result of matching against the hierarchical catalog
23#[derive(Debug, Clone)]
24pub struct HierarchicalMatchResult {
25    /// The matched distribution
26    pub distribution_id: String,
27    /// Display name of the distribution
28    pub display_name: String,
29    /// Assembly ID (empty for standalone distributions)
30    pub assembly_id: String,
31    /// Assembly name
32    pub assembly_name: String,
33    /// Version ID
34    pub version_id: String,
35    /// Version string
36    pub version_string: String,
37    /// Match type
38    pub match_type: MatchType,
39    /// Match score (0.0 - 1.0)
40    pub score: f64,
41    /// Number of query contigs matched
42    pub matched_contigs: usize,
43    /// Total query contigs
44    pub total_query_contigs: usize,
45    /// Number of distribution contigs
46    pub total_distribution_contigs: usize,
47    /// Presence breakdown
48    pub presence_counts: PresenceCounts,
49    /// Extra contigs in query not in distribution
50    pub extra_in_query: usize,
51    /// Missing contigs from query that are in distribution
52    pub missing_from_query: usize,
53}
54
55impl HierarchicalMatchResult {
56    /// Get match percentage
57    #[must_use]
58    pub fn match_percentage(&self) -> f64 {
59        self.score * 100.0
60    }
61
62    /// Check if this is an exact match
63    #[cfg(test)]
64    #[must_use]
65    pub fn is_exact(&self) -> bool {
66        matches!(self.match_type, MatchType::Exact)
67    }
68}
69
70/// Matching engine for hierarchical catalogs
71pub struct HierarchicalMatchingEngine<'a> {
72    catalog: &'a HierarchicalCatalog,
73    index: CatalogIndex,
74    min_score: f64,
75}
76
77impl<'a> HierarchicalMatchingEngine<'a> {
78    /// Create a new engine from a hierarchical catalog
79    #[must_use]
80    pub fn new(catalog: &'a HierarchicalCatalog) -> Self {
81        let index = catalog.build_index();
82        Self {
83            catalog,
84            index,
85            min_score: 0.1,
86        }
87    }
88
89    /// Find the best matching distributions for a query
90    #[must_use]
91    pub fn find_matches(&self, query: &QueryHeader, limit: usize) -> Vec<HierarchicalMatchResult> {
92        // Score each distribution based on MD5 and name+length matches
93        let mut scores: HashMap<String, (usize, usize)> = HashMap::new(); // dist_id -> (md5_matches, name_len_matches)
94
95        // Count matches for each distribution
96        for contig in &query.contigs {
97            // Try MD5 matching first
98            if let Some(md5) = &contig.md5 {
99                for location in self.index.find_by_md5(md5) {
100                    let entry = scores
101                        .entry(location.distribution_id.clone())
102                        .or_insert((0, 0));
103                    entry.0 += 1;
104                }
105            }
106
107            // Also try name+length matching
108            for location in self.index.find_by_name_length(&contig.name, contig.length) {
109                let entry = scores
110                    .entry(location.distribution_id.clone())
111                    .or_insert((0, 0));
112                entry.1 += 1;
113            }
114        }
115
116        // Build results
117        let mut results: Vec<HierarchicalMatchResult> = scores
118            .into_iter()
119            .filter_map(|(dist_id, (md5_matches, name_len_matches))| {
120                self.build_result(&dist_id, query, md5_matches, name_len_matches)
121            })
122            .filter(|r| r.score >= self.min_score)
123            .collect();
124
125        // Sort by score descending
126        results.sort_by(|a, b| {
127            b.score
128                .partial_cmp(&a.score)
129                .unwrap_or(std::cmp::Ordering::Equal)
130        });
131
132        results.truncate(limit);
133        results
134    }
135
136    /// Build a match result for a distribution
137    fn build_result(
138        &self,
139        dist_id: &str,
140        query: &QueryHeader,
141        md5_matches: usize,
142        name_len_matches: usize,
143    ) -> Option<HierarchicalMatchResult> {
144        let dist_ref = self.catalog.get_distribution(dist_id)?;
145        let dist = dist_ref.distribution;
146
147        // Use the better of MD5 or name+length matches
148        let matched = md5_matches.max(name_len_matches);
149
150        // Calculate Jaccard similarity
151        let query_size = query.contigs.len();
152        let dist_size = dist.contigs.len();
153        let union = query_size + dist_size - matched;
154        let score = if union > 0 {
155            count_to_f64(matched) / count_to_f64(union)
156        } else {
157            0.0
158        };
159
160        // Determine match type
161        let match_type = if matched == query_size && matched == dist_size {
162            MatchType::Exact
163        } else if matched > 0 {
164            MatchType::Partial
165        } else {
166            MatchType::NoMatch
167        };
168
169        let presence_counts = dist.presence_counts();
170
171        Some(HierarchicalMatchResult {
172            distribution_id: dist_id.to_string(),
173            display_name: dist.display_name.clone(),
174            assembly_id: dist_ref.assembly.map(|a| a.id.clone()).unwrap_or_default(),
175            assembly_name: dist_ref
176                .assembly
177                .map(|a| a.name.clone())
178                .unwrap_or_default(),
179            version_id: dist_ref.version.map(|v| v.id.clone()).unwrap_or_default(),
180            version_string: dist_ref
181                .version
182                .map(|v| v.version.clone())
183                .unwrap_or_default(),
184            match_type,
185            score,
186            matched_contigs: matched,
187            total_query_contigs: query_size,
188            total_distribution_contigs: dist_size,
189            presence_counts,
190            extra_in_query: query_size.saturating_sub(matched),
191            missing_from_query: dist_size.saturating_sub(matched),
192        })
193    }
194}
195
196#[cfg(test)]
197mod tests {
198    use super::*;
199    use crate::core::assembly::{
200        AssemblyVersion, FastaContig, FastaDistribution, HierarchicalAssembly, ReportSource,
201    };
202    use crate::core::contig::Contig;
203    use crate::core::types::ReferenceSource;
204
205    fn make_test_catalog() -> HierarchicalCatalog {
206        HierarchicalCatalog {
207            version: "1.0.0".to_string(),
208            created_at: "2024-01-01T00:00:00Z".to_string(),
209            assemblies: vec![HierarchicalAssembly {
210                id: "grch38".to_string(),
211                name: "GRCh38".to_string(),
212                organism: "Homo sapiens".to_string(),
213                versions: vec![AssemblyVersion {
214                    id: "grch38_p14".to_string(),
215                    version: "p14".to_string(),
216                    source: ReportSource::Ncbi {
217                        accession: "GCF_000001405.40".to_string(),
218                        url: None,
219                        date: None,
220                    },
221                    report_contigs: vec![],
222                    fasta_distributions: vec![
223                        FastaDistribution {
224                            id: "hg38_ucsc".to_string(),
225                            display_name: "hg38 (UCSC)".to_string(),
226                            source: ReferenceSource::Ucsc,
227                            download_url: None,
228                            tags: vec![],
229                            contigs: vec![
230                                FastaContig::new(
231                                    "chr1",
232                                    248_956_422,
233                                    "6aef897c3d6ff0c78aff06ac189178dd",
234                                ),
235                                FastaContig::new(
236                                    "chr2",
237                                    242_193_529,
238                                    "f98db672eb0993dcfdabafe2a882905c",
239                                ),
240                            ],
241                        },
242                        FastaDistribution {
243                            id: "hs38DH".to_string(),
244                            display_name: "hs38DH (1KG)".to_string(),
245                            source: ReferenceSource::OneThousandGenomes,
246                            download_url: None,
247                            tags: vec![],
248                            contigs: vec![
249                                FastaContig::new(
250                                    "chr1",
251                                    248_956_422,
252                                    "6aef897c3d6ff0c78aff06ac189178dd",
253                                ),
254                                FastaContig::new(
255                                    "chr2",
256                                    242_193_529,
257                                    "f98db672eb0993dcfdabafe2a882905c",
258                                ),
259                                FastaContig::new(
260                                    "chr1_decoy",
261                                    5000,
262                                    "decoy_md5_hash_here_12345678",
263                                ),
264                            ],
265                        },
266                    ],
267                }],
268            }],
269            standalone_distributions: vec![],
270        }
271    }
272
273    #[test]
274    fn test_exact_match() {
275        let catalog = make_test_catalog();
276        let engine = HierarchicalMatchingEngine::new(&catalog);
277
278        // Query exactly matching hg38_ucsc
279        let query = QueryHeader::new(vec![
280            Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
281            Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
282        ]);
283
284        let matches = engine.find_matches(&query, 5);
285        assert!(!matches.is_empty());
286
287        // Should find hg38_ucsc as exact match
288        let hg38 = matches.iter().find(|m| m.distribution_id == "hg38_ucsc");
289        assert!(hg38.is_some());
290        let hg38 = hg38.unwrap();
291        assert!(hg38.is_exact());
292        assert_eq!(hg38.assembly_name, "GRCh38");
293    }
294
295    #[test]
296    fn test_subset_match() {
297        let catalog = make_test_catalog();
298        let engine = HierarchicalMatchingEngine::new(&catalog);
299
300        // Query with only chr1
301        let query = QueryHeader::new(vec![
302            Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd")
303        ]);
304
305        let matches = engine.find_matches(&query, 5);
306        assert!(!matches.is_empty());
307
308        // Query is a partial match (subset of both distributions)
309        let best = &matches[0];
310        assert!(matches!(best.match_type, MatchType::Partial));
311        assert_eq!(best.matched_contigs, 1);
312        assert!(best.missing_from_query > 0);
313    }
314
315    #[test]
316    fn test_superset_match() {
317        let catalog = make_test_catalog();
318        let engine = HierarchicalMatchingEngine::new(&catalog);
319
320        // Query with extra contig not in hg38_ucsc
321        let query = QueryHeader::new(vec![
322            Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd"),
323            Contig::new("chr2", 242_193_529).with_md5("f98db672eb0993dcfdabafe2a882905c"),
324            Contig::new("chr1_decoy", 5000).with_md5("decoy_md5_hash_here_12345678"),
325        ]);
326
327        let matches = engine.find_matches(&query, 5);
328        assert!(!matches.is_empty());
329
330        // Should be exact match for hs38DH
331        let hs38dh = matches.iter().find(|m| m.distribution_id == "hs38DH");
332        assert!(hs38dh.is_some());
333        assert!(hs38dh.unwrap().is_exact());
334
335        // hg38_ucsc is a partial match (query has extra contigs)
336        let hg38 = matches.iter().find(|m| m.distribution_id == "hg38_ucsc");
337        assert!(hg38.is_some());
338        assert!(matches!(hg38.unwrap().match_type, MatchType::Partial));
339        assert!(hg38.unwrap().extra_in_query > 0);
340    }
341
342    #[test]
343    fn test_name_length_fallback() {
344        let catalog = make_test_catalog();
345        let engine = HierarchicalMatchingEngine::new(&catalog);
346
347        // Query without MD5
348        let query = QueryHeader::new(vec![
349            Contig::new("chr1", 248_956_422),
350            Contig::new("chr2", 242_193_529),
351        ]);
352
353        let matches = engine.find_matches(&query, 5);
354        assert!(!matches.is_empty());
355
356        // Should still find matches via name+length
357        let best = &matches[0];
358        assert!(best.score > 0.5);
359    }
360
361    #[test]
362    fn test_assembly_info_populated() {
363        let catalog = make_test_catalog();
364        let engine = HierarchicalMatchingEngine::new(&catalog);
365
366        let query = QueryHeader::new(vec![
367            Contig::new("chr1", 248_956_422).with_md5("6aef897c3d6ff0c78aff06ac189178dd")
368        ]);
369
370        let matches = engine.find_matches(&query, 5);
371        let best = &matches[0];
372
373        assert_eq!(best.assembly_id, "grch38");
374        assert_eq!(best.assembly_name, "GRCh38");
375        assert_eq!(best.version_id, "grch38_p14");
376        assert_eq!(best.version_string, "p14");
377    }
378}