Skip to main content

ref_solver/catalog/
store.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::path::Path;
4use thiserror::Error;
5
6use crate::core::reference::KnownReference;
7use crate::core::types::ReferenceId;
8
9#[derive(Error, Debug)]
10pub enum CatalogError {
11    #[error("Failed to read catalog: {0}")]
12    ReadError(#[from] std::io::Error),
13
14    #[error("Failed to parse catalog: {0}")]
15    ParseError(#[from] serde_json::Error),
16}
17
/// Catalog format version used for compatibility checking.
///
/// Written into the `version` field of every exported catalog and compared
/// against the `version` field of every catalog that is loaded.
pub const CATALOG_VERSION: &str = "1.0.0";
20
21/// Serializable catalog format
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct CatalogData {
24    pub version: String,
25    pub created_at: String,
26    pub references: Vec<KnownReference>,
27}
28
29/// The main reference catalog with indexes
30#[derive(Debug)]
31pub struct ReferenceCatalog {
32    /// All known references
33    pub references: Vec<KnownReference>,
34
35    /// Index: reference ID -> index in references vec
36    id_to_index: HashMap<ReferenceId, usize>,
37
38    /// Index: MD5 -> indices of references containing this MD5
39    pub md5_to_refs: HashMap<String, Vec<usize>>,
40
41    /// Index: (`exact_name`, length) -> indices of references
42    pub name_length_to_refs: HashMap<(String, u64), Vec<usize>>,
43
44    /// Index: (alias, length) -> indices of references
45    /// Separate from `name_length_to_refs` to distinguish primary names from aliases
46    pub alias_length_to_refs: HashMap<(String, u64), Vec<usize>>,
47
48    /// Index: signature -> reference index (for exact matches)
49    signature_to_ref: HashMap<String, usize>,
50}
51
52impl ReferenceCatalog {
53    /// Create an empty catalog
54    #[must_use]
55    pub fn new() -> Self {
56        Self {
57            references: Vec::new(),
58            id_to_index: HashMap::new(),
59            md5_to_refs: HashMap::new(),
60            name_length_to_refs: HashMap::new(),
61            alias_length_to_refs: HashMap::new(),
62            signature_to_ref: HashMap::new(),
63        }
64    }
65
66    /// Load the embedded default catalog
67    ///
68    /// # Errors
69    ///
70    /// Returns `CatalogError::Json` if the embedded catalog is invalid.
71    pub fn load_embedded() -> Result<Self, CatalogError> {
72        // Embedded at compile time via build.rs
73        const EMBEDDED_CATALOG: &str = include_str!("../../catalogs/human_references.json");
74        Self::from_json(EMBEDDED_CATALOG)
75    }
76
77    /// Load catalog from a JSON file
78    ///
79    /// # Errors
80    ///
81    /// Returns `CatalogError::Io` if the file cannot be read, or
82    /// `CatalogError::Json` if parsing fails.
83    pub fn load_from_file(path: &Path) -> Result<Self, CatalogError> {
84        let content = std::fs::read_to_string(path)?;
85        Self::from_json(&content)
86    }
87
88    /// Parse catalog from JSON string
89    ///
90    /// # Errors
91    ///
92    /// Returns `CatalogError::Json` if the JSON is invalid.
93    pub fn from_json(json: &str) -> Result<Self, CatalogError> {
94        let data: CatalogData = serde_json::from_str(json)?;
95
96        // Version check (warn but don't fail)
97        if data.version != CATALOG_VERSION {
98            eprintln!(
99                "Warning: Catalog version mismatch (expected {}, found {})",
100                CATALOG_VERSION, data.version
101            );
102        }
103
104        let mut catalog = Self::new();
105        for mut reference in data.references {
106            reference.rebuild_indexes();
107            catalog.add_reference(reference);
108        }
109
110        Ok(catalog)
111    }
112
113    /// Add a reference to the catalog
114    pub fn add_reference(&mut self, reference: KnownReference) {
115        let index = self.references.len();
116
117        // Index by ID
118        self.id_to_index.insert(reference.id.clone(), index);
119
120        // Index by MD5s
121        for md5 in &reference.md5_set {
122            self.md5_to_refs.entry(md5.clone()).or_default().push(index);
123        }
124
125        // Index by (name, length) pairs
126        for (name, length) in &reference.name_length_set {
127            self.name_length_to_refs
128                .entry((name.clone(), *length))
129                .or_default()
130                .push(index);
131        }
132
133        // Index by (alias, length) pairs
134        for contig in &reference.contigs {
135            for alias in &contig.aliases {
136                self.alias_length_to_refs
137                    .entry((alias.clone(), contig.length))
138                    .or_default()
139                    .push(index);
140            }
141        }
142
143        // Index by signature
144        if let Some(sig) = &reference.signature {
145            self.signature_to_ref.insert(sig.clone(), index);
146        }
147
148        self.references.push(reference);
149    }
150
151    /// Get a reference by ID
152    #[must_use]
153    pub fn get(&self, id: &ReferenceId) -> Option<&KnownReference> {
154        self.id_to_index.get(id).map(|&idx| &self.references[idx])
155    }
156
157    /// Find exact match by signature
158    #[must_use]
159    pub fn find_by_signature(&self, signature: &str) -> Option<&KnownReference> {
160        self.signature_to_ref
161            .get(signature)
162            .map(|&idx| &self.references[idx])
163    }
164
165    /// Export catalog to JSON
166    ///
167    /// # Errors
168    ///
169    /// Returns `CatalogError::Json` if serialization fails.
170    pub fn to_json(&self) -> Result<String, CatalogError> {
171        let data = CatalogData {
172            version: CATALOG_VERSION.to_string(),
173            created_at: chrono::Utc::now().to_rfc3339(),
174            references: self.references.clone(),
175        };
176        Ok(serde_json::to_string_pretty(&data)?)
177    }
178
179    /// Number of references in catalog
180    #[must_use]
181    pub fn len(&self) -> usize {
182        self.references.len()
183    }
184
185    /// Check if catalog is empty
186    #[must_use]
187    pub fn is_empty(&self) -> bool {
188        self.references.is_empty()
189    }
190}
191
192impl Default for ReferenceCatalog {
193    fn default() -> Self {
194        Self::new()
195    }
196}
197
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::contig::Contig;
    use crate::core::types::{Assembly, ReferenceSource};

    /// The embedded catalog parses and contains at least one reference.
    #[test]
    fn test_load_embedded_catalog() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        assert!(!catalog.is_empty());
    }

    /// A known ID from the embedded catalog resolves to the expected entry.
    #[test]
    fn test_catalog_get_by_id() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();

        let hg38 = catalog
            .get(&ReferenceId::new("hg38_ucsc"))
            .expect("hg38_ucsc should exist in the embedded catalog");
        assert_eq!(hg38.display_name, "hg38 (UCSC)");
        assert!(!hg38.contigs.is_empty());
    }

    /// Looking up an unknown ID yields `None` rather than panicking.
    #[test]
    fn test_catalog_get_nonexistent() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        let result = catalog.get(&ReferenceId::new("nonexistent_ref"));
        assert!(result.is_none());
    }

    /// Exported JSON contains the top-level keys and a known reference ID.
    #[test]
    fn test_catalog_to_json() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        let json = catalog.to_json().unwrap();

        assert!(json.contains("\"version\""));
        assert!(json.contains("\"references\""));
        assert!(json.contains("hg38_ucsc"));
    }

    /// A manually added reference is counted and retrievable by ID.
    #[test]
    fn test_add_reference() {
        let mut catalog = ReferenceCatalog::new();
        assert_eq!(catalog.len(), 0);

        let contigs = vec![Contig::new("chr1", 100).with_md5("abc123")];
        let reference = KnownReference::new(
            "test_ref",
            "Test Reference",
            Assembly::Grch38,
            ReferenceSource::Custom("test".to_string()),
        )
        .with_contigs(contigs);

        catalog.add_reference(reference);
        assert_eq!(catalog.len(), 1);

        let retrieved = catalog
            .get(&ReferenceId::new("test_ref"))
            .expect("test_ref should be retrievable after add_reference");
        assert_eq!(retrieved.display_name, "Test Reference");
    }

    #[test]
    fn test_name_length_set_contains_aliases() {
        // Verify that aliases are added to name_length_set during rebuild_indexes
        let catalog = ReferenceCatalog::load_embedded().unwrap();

        // Find grch38_ncbi which has aliases (NCBI accession numbers with UCSC aliases)
        let ref38 = catalog
            .references
            .iter()
            .find(|r| r.id.0 == "grch38_ncbi")
            .expect("grch38_ncbi should exist");

        // The reference should have contigs with aliases
        // So name_length_set should have more entries than just contigs
        assert!(
            ref38.name_length_set.len() > ref38.contigs.len(),
            "name_length_set should contain more entries than contigs (should include aliases), \
             got {} vs {} contigs",
            ref38.name_length_set.len(),
            ref38.contigs.len()
        );

        // Check that chr1 is in name_length_set (alias of NC_000001.11)
        let chr1_key = ("chr1".to_string(), 248_956_422u64);
        assert!(
            ref38.name_length_set.contains(&chr1_key),
            "chr1 should be in name_length_set as an alias"
        );
    }

    #[test]
    fn test_alias_indexing() {
        // Test that contig aliases are indexed in name_length_to_refs
        // so query names can match catalog aliases
        let mut catalog = ReferenceCatalog::new();

        // Create a reference with NCBI names and UCSC aliases
        let contigs = vec![
            Contig::new("NC_000001.11", 248_956_422)
                .with_md5("6aef897c3d6ff0c78aff06ac189178dd")
                .with_aliases(vec!["chr1".to_string(), "1".to_string()]),
            Contig::new("NC_000002.12", 242_193_529)
                .with_md5("f98db672eb0993dcfdabafe2a882905c")
                .with_aliases(vec!["chr2".to_string(), "2".to_string()]),
        ];

        let reference = KnownReference::new(
            "test_ncbi_ref",
            "Test NCBI Reference",
            Assembly::Grch38,
            ReferenceSource::Custom("test".to_string()),
        )
        .with_contigs(contigs);

        catalog.add_reference(reference);

        // Verify primary names are indexed
        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("NC_000001.11".to_string(), 248_956_422)),
            "Primary name should be indexed"
        );

        // Verify aliases are also indexed in name_length_to_refs
        // This is CRITICAL for matching UCSC queries against NCBI references
        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("chr1".to_string(), 248_956_422)),
            "Alias 'chr1' should be indexed in name_length_to_refs"
        );
        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("1".to_string(), 248_956_422)),
            "Alias '1' should be indexed in name_length_to_refs"
        );
        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("chr2".to_string(), 242_193_529)),
            "Alias 'chr2' should be indexed in name_length_to_refs"
        );

        // The dedicated alias index is filled directly from contig aliases
        assert!(
            catalog
                .alias_length_to_refs
                .contains_key(&("chr1".to_string(), 248_956_422)),
            "Alias 'chr1' should be indexed in alias_length_to_refs"
        );
        assert!(
            catalog
                .alias_length_to_refs
                .contains_key(&("chr2".to_string(), 242_193_529)),
            "Alias 'chr2' should be indexed in alias_length_to_refs"
        );
    }
}