Skip to main content

ref_solver/catalog/
store.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::path::Path;
4use thiserror::Error;
5
6use crate::core::reference::KnownReference;
7use crate::core::types::ReferenceId;
8
9#[derive(Error, Debug)]
10pub enum CatalogError {
11    #[error("Failed to read catalog: {0}")]
12    ReadError(#[from] std::io::Error),
13
14    #[error("Failed to parse catalog: {0}")]
15    ParseError(#[from] serde_json::Error),
16}
17
/// Version string this code writes into exported catalogs and expects to find
/// in catalogs it parses.
pub const CATALOG_VERSION: &str = "1.0.0";
20
21/// Serializable catalog format
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct CatalogData {
24    pub version: String,
25    pub created_at: String,
26    pub references: Vec<KnownReference>,
27}
28
29/// The main reference catalog with indexes
30#[derive(Debug)]
31pub struct ReferenceCatalog {
32    /// All known references
33    pub references: Vec<KnownReference>,
34
35    /// Index: reference ID -> index in references vec
36    id_to_index: HashMap<ReferenceId, usize>,
37
38    /// Index: MD5 -> indices of references containing this MD5
39    pub md5_to_refs: HashMap<String, Vec<usize>>,
40
41    /// Index: sha512t24u digest -> indices of references containing this digest
42    pub sha512t24u_to_refs: HashMap<String, Vec<usize>>,
43
44    /// Index: (`exact_name`, length) -> indices of references
45    pub name_length_to_refs: HashMap<(String, u64), Vec<usize>>,
46
47    /// Index: (alias, length) -> indices of references
48    /// Separate from `name_length_to_refs` to distinguish primary names from aliases
49    pub alias_length_to_refs: HashMap<(String, u64), Vec<usize>>,
50
51    /// Index: signature -> reference index (for exact matches)
52    signature_to_ref: HashMap<String, usize>,
53}
54
55impl ReferenceCatalog {
56    /// Create an empty catalog
57    #[must_use]
58    pub fn new() -> Self {
59        Self {
60            references: Vec::new(),
61            id_to_index: HashMap::new(),
62            md5_to_refs: HashMap::new(),
63            sha512t24u_to_refs: HashMap::new(),
64            name_length_to_refs: HashMap::new(),
65            alias_length_to_refs: HashMap::new(),
66            signature_to_ref: HashMap::new(),
67        }
68    }
69
70    /// Load the embedded default catalog
71    ///
72    /// # Errors
73    ///
74    /// Returns `CatalogError::Json` if the embedded catalog is invalid.
75    pub fn load_embedded() -> Result<Self, CatalogError> {
76        // Embedded at compile time via build.rs
77        const EMBEDDED_CATALOG: &str = include_str!("../../catalogs/human_references.json");
78        Self::from_json(EMBEDDED_CATALOG)
79    }
80
81    /// Load catalog from a JSON file
82    ///
83    /// # Errors
84    ///
85    /// Returns `CatalogError::Io` if the file cannot be read, or
86    /// `CatalogError::Json` if parsing fails.
87    pub fn load_from_file(path: &Path) -> Result<Self, CatalogError> {
88        let content = std::fs::read_to_string(path)?;
89        Self::from_json(&content)
90    }
91
92    /// Parse catalog from JSON string
93    ///
94    /// # Errors
95    ///
96    /// Returns `CatalogError::Json` if the JSON is invalid.
97    pub fn from_json(json: &str) -> Result<Self, CatalogError> {
98        let data: CatalogData = serde_json::from_str(json)?;
99
100        // Version check (warn but don't fail)
101        if data.version != CATALOG_VERSION {
102            eprintln!(
103                "Warning: Catalog version mismatch (expected {}, found {})",
104                CATALOG_VERSION, data.version
105            );
106        }
107
108        let mut catalog = Self::new();
109        for mut reference in data.references {
110            reference.rebuild_indexes();
111            catalog.add_reference(reference);
112        }
113
114        Ok(catalog)
115    }
116
117    /// Add a reference to the catalog
118    pub fn add_reference(&mut self, reference: KnownReference) {
119        let index = self.references.len();
120
121        // Index by ID
122        self.id_to_index.insert(reference.id.clone(), index);
123
124        // Index by MD5s
125        for md5 in &reference.md5_set {
126            self.md5_to_refs.entry(md5.clone()).or_default().push(index);
127        }
128
129        // Index by sha512t24u digests
130        for digest in &reference.sha512t24u_set {
131            self.sha512t24u_to_refs
132                .entry(digest.clone())
133                .or_default()
134                .push(index);
135        }
136
137        // Index by (name, length) pairs
138        for (name, length) in &reference.name_length_set {
139            self.name_length_to_refs
140                .entry((name.clone(), *length))
141                .or_default()
142                .push(index);
143        }
144
145        // Index by (alias, length) pairs
146        for contig in &reference.contigs {
147            for alias in &contig.aliases {
148                self.alias_length_to_refs
149                    .entry((alias.clone(), contig.length))
150                    .or_default()
151                    .push(index);
152            }
153        }
154
155        // Index by signature
156        if let Some(sig) = &reference.signature {
157            self.signature_to_ref.insert(sig.clone(), index);
158        }
159
160        self.references.push(reference);
161    }
162
163    /// Get a reference by ID
164    #[must_use]
165    pub fn get(&self, id: &ReferenceId) -> Option<&KnownReference> {
166        self.id_to_index.get(id).map(|&idx| &self.references[idx])
167    }
168
169    /// Find exact match by signature
170    #[must_use]
171    pub fn find_by_signature(&self, signature: &str) -> Option<&KnownReference> {
172        self.signature_to_ref
173            .get(signature)
174            .map(|&idx| &self.references[idx])
175    }
176
177    /// Export catalog to JSON
178    ///
179    /// # Errors
180    ///
181    /// Returns `CatalogError::Json` if serialization fails.
182    pub fn to_json(&self) -> Result<String, CatalogError> {
183        let data = CatalogData {
184            version: CATALOG_VERSION.to_string(),
185            created_at: chrono::Utc::now().to_rfc3339(),
186            references: self.references.clone(),
187        };
188        Ok(serde_json::to_string_pretty(&data)?)
189    }
190
191    /// Number of references in catalog
192    #[must_use]
193    pub fn len(&self) -> usize {
194        self.references.len()
195    }
196
197    /// Check if catalog is empty
198    #[must_use]
199    pub fn is_empty(&self) -> bool {
200        self.references.is_empty()
201    }
202}
203
204impl Default for ReferenceCatalog {
205    fn default() -> Self {
206        Self::new()
207    }
208}
209
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::contig::Contig;
    use crate::core::types::{Assembly, ReferenceSource};

    /// The embedded catalog parses and is not empty.
    #[test]
    fn test_load_embedded_catalog() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        assert!(!catalog.is_empty());
    }

    /// A known ID resolves to the expected reference.
    #[test]
    fn test_catalog_get_by_id() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();

        let hg38 = catalog
            .get(&ReferenceId::new("hg38_ucsc"))
            .expect("hg38_ucsc should exist in the embedded catalog");
        assert_eq!(hg38.display_name, "hg38 (UCSC)");
        assert!(!hg38.contigs.is_empty());
    }

    /// An unknown ID yields `None`.
    #[test]
    fn test_catalog_get_nonexistent() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        let result = catalog.get(&ReferenceId::new("nonexistent_ref"));
        assert!(result.is_none());
    }

    /// The JSON export contains the top-level keys and a known reference ID.
    #[test]
    fn test_catalog_to_json() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        let json = catalog.to_json().unwrap();

        assert!(json.contains("\"version\""));
        assert!(json.contains("\"references\""));
        assert!(json.contains("hg38_ucsc"));
    }

    /// Adding a reference grows the catalog and makes it retrievable by ID.
    #[test]
    fn test_add_reference() {
        let mut catalog = ReferenceCatalog::new();
        assert_eq!(catalog.len(), 0);

        let contigs = vec![Contig::new("chr1", 100).with_md5("abc123")];
        let reference = KnownReference::new(
            "test_ref",
            "Test Reference",
            Assembly::Grch38,
            ReferenceSource::Custom("test".to_string()),
        )
        .with_contigs(contigs);

        catalog.add_reference(reference);
        assert_eq!(catalog.len(), 1);

        let retrieved = catalog
            .get(&ReferenceId::new("test_ref"))
            .expect("test_ref should be retrievable after add_reference");
        assert_eq!(retrieved.display_name, "Test Reference");
    }

    #[test]
    fn test_name_length_set_contains_aliases() {
        // Verify that aliases are added to name_length_set during rebuild_indexes
        let catalog = ReferenceCatalog::load_embedded().unwrap();

        // Find grch38_ncbi which has aliases (NCBI accession numbers with UCSC aliases)
        let ref38 = catalog
            .references
            .iter()
            .find(|r| r.id.0 == "grch38_ncbi")
            .expect("grch38_ncbi should exist");

        // The reference should have contigs with aliases
        // So name_length_set should have more entries than just contigs
        assert!(
            ref38.name_length_set.len() > ref38.contigs.len(),
            "name_length_set should contain more entries than contigs (should include aliases), \
             got {} vs {} contigs",
            ref38.name_length_set.len(),
            ref38.contigs.len()
        );

        // Check that chr1 is in name_length_set (alias of NC_000001.11)
        let chr1_key = ("chr1".to_string(), 248_956_422u64);
        assert!(
            ref38.name_length_set.contains(&chr1_key),
            "chr1 should be in name_length_set as an alias"
        );
    }

    #[test]
    fn test_alias_indexing() {
        // Test that contig aliases are indexed in name_length_to_refs
        // so query names can match catalog aliases, and that they are also
        // recorded in the dedicated alias_length_to_refs index.
        let mut catalog = ReferenceCatalog::new();

        // Create a reference with NCBI names and UCSC aliases
        let contigs = vec![
            Contig::new("NC_000001.11", 248_956_422)
                .with_md5("6aef897c3d6ff0c78aff06ac189178dd")
                .with_aliases(vec!["chr1".to_string(), "1".to_string()]),
            Contig::new("NC_000002.12", 242_193_529)
                .with_md5("f98db672eb0993dcfdabafe2a882905c")
                .with_aliases(vec!["chr2".to_string(), "2".to_string()]),
        ];

        let reference = KnownReference::new(
            "test_ncbi_ref",
            "Test NCBI Reference",
            Assembly::Grch38,
            ReferenceSource::Custom("test".to_string()),
        )
        .with_contigs(contigs);

        catalog.add_reference(reference);

        // Verify primary names are indexed
        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("NC_000001.11".to_string(), 248_956_422)),
            "Primary name should be indexed"
        );

        // Verify aliases are also indexed in name_length_to_refs
        // This is CRITICAL for matching UCSC queries against NCBI references
        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("chr1".to_string(), 248_956_422)),
            "Alias 'chr1' should be indexed in name_length_to_refs"
        );
        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("1".to_string(), 248_956_422)),
            "Alias '1' should be indexed in name_length_to_refs"
        );
        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("chr2".to_string(), 242_193_529)),
            "Alias 'chr2' should be indexed in name_length_to_refs"
        );

        // Verify the dedicated alias index is populated as well
        assert!(
            catalog
                .alias_length_to_refs
                .contains_key(&("chr1".to_string(), 248_956_422)),
            "Alias 'chr1' should be indexed in alias_length_to_refs"
        );
        assert!(
            catalog
                .alias_length_to_refs
                .contains_key(&("2".to_string(), 242_193_529)),
            "Alias '2' should be indexed in alias_length_to_refs"
        );
    }
}