ref_solver/catalog/
store.rs1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::path::Path;
4use thiserror::Error;
5
6use crate::core::reference::KnownReference;
7use crate::core::types::ReferenceId;
8
9#[derive(Error, Debug)]
10pub enum CatalogError {
11 #[error("Failed to read catalog: {0}")]
12 ReadError(#[from] std::io::Error),
13
14 #[error("Failed to parse catalog: {0}")]
15 ParseError(#[from] serde_json::Error),
16}
17
/// Catalog format version written by `ReferenceCatalog::to_json` and compared
/// against the `version` field of a loaded catalog in `ReferenceCatalog::from_json`.
pub const CATALOG_VERSION: &str = "1.0.0";
20
21#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct CatalogData {
24 pub version: String,
25 pub created_at: String,
26 pub references: Vec<KnownReference>,
27}
28
29#[derive(Debug)]
31pub struct ReferenceCatalog {
32 pub references: Vec<KnownReference>,
34
35 id_to_index: HashMap<ReferenceId, usize>,
37
38 pub md5_to_refs: HashMap<String, Vec<usize>>,
40
41 pub name_length_to_refs: HashMap<(String, u64), Vec<usize>>,
43
44 pub alias_length_to_refs: HashMap<(String, u64), Vec<usize>>,
47
48 signature_to_ref: HashMap<String, usize>,
50}
51
52impl ReferenceCatalog {
53 #[must_use]
55 pub fn new() -> Self {
56 Self {
57 references: Vec::new(),
58 id_to_index: HashMap::new(),
59 md5_to_refs: HashMap::new(),
60 name_length_to_refs: HashMap::new(),
61 alias_length_to_refs: HashMap::new(),
62 signature_to_ref: HashMap::new(),
63 }
64 }
65
66 pub fn load_embedded() -> Result<Self, CatalogError> {
72 const EMBEDDED_CATALOG: &str = include_str!("../../catalogs/human_references.json");
74 Self::from_json(EMBEDDED_CATALOG)
75 }
76
77 pub fn load_from_file(path: &Path) -> Result<Self, CatalogError> {
84 let content = std::fs::read_to_string(path)?;
85 Self::from_json(&content)
86 }
87
88 pub fn from_json(json: &str) -> Result<Self, CatalogError> {
94 let data: CatalogData = serde_json::from_str(json)?;
95
96 if data.version != CATALOG_VERSION {
98 eprintln!(
99 "Warning: Catalog version mismatch (expected {}, found {})",
100 CATALOG_VERSION, data.version
101 );
102 }
103
104 let mut catalog = Self::new();
105 for mut reference in data.references {
106 reference.rebuild_indexes();
107 catalog.add_reference(reference);
108 }
109
110 Ok(catalog)
111 }
112
113 pub fn add_reference(&mut self, reference: KnownReference) {
115 let index = self.references.len();
116
117 self.id_to_index.insert(reference.id.clone(), index);
119
120 for md5 in &reference.md5_set {
122 self.md5_to_refs.entry(md5.clone()).or_default().push(index);
123 }
124
125 for (name, length) in &reference.name_length_set {
127 self.name_length_to_refs
128 .entry((name.clone(), *length))
129 .or_default()
130 .push(index);
131 }
132
133 for contig in &reference.contigs {
135 for alias in &contig.aliases {
136 self.alias_length_to_refs
137 .entry((alias.clone(), contig.length))
138 .or_default()
139 .push(index);
140 }
141 }
142
143 if let Some(sig) = &reference.signature {
145 self.signature_to_ref.insert(sig.clone(), index);
146 }
147
148 self.references.push(reference);
149 }
150
151 #[must_use]
153 pub fn get(&self, id: &ReferenceId) -> Option<&KnownReference> {
154 self.id_to_index.get(id).map(|&idx| &self.references[idx])
155 }
156
157 #[must_use]
159 pub fn find_by_signature(&self, signature: &str) -> Option<&KnownReference> {
160 self.signature_to_ref
161 .get(signature)
162 .map(|&idx| &self.references[idx])
163 }
164
165 pub fn to_json(&self) -> Result<String, CatalogError> {
171 let data = CatalogData {
172 version: CATALOG_VERSION.to_string(),
173 created_at: chrono::Utc::now().to_rfc3339(),
174 references: self.references.clone(),
175 };
176 Ok(serde_json::to_string_pretty(&data)?)
177 }
178
179 #[must_use]
181 pub fn len(&self) -> usize {
182 self.references.len()
183 }
184
185 #[must_use]
187 pub fn is_empty(&self) -> bool {
188 self.references.is_empty()
189 }
190}
191
192impl Default for ReferenceCatalog {
193 fn default() -> Self {
194 Self::new()
195 }
196}
197
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::contig::Contig;
    use crate::core::types::{Assembly, ReferenceSource};

    /// The embedded catalog parses and is not empty.
    #[test]
    fn test_load_embedded_catalog() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        assert!(!catalog.is_empty());
    }

    /// A known id resolves to the expected reference.
    #[test]
    fn test_catalog_get_by_id() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();

        let hg38 = catalog
            .get(&ReferenceId::new("hg38_ucsc"))
            .expect("hg38_ucsc should exist in the embedded catalog");
        assert_eq!(hg38.display_name, "hg38 (UCSC)");
        assert!(!hg38.contigs.is_empty());
    }

    /// An unknown id yields `None`.
    #[test]
    fn test_catalog_get_nonexistent() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        assert!(catalog.get(&ReferenceId::new("nonexistent_ref")).is_none());
    }

    /// Serialized output carries the top-level keys and a known reference id.
    #[test]
    fn test_catalog_to_json() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();
        let json = catalog.to_json().unwrap();

        assert!(json.contains("\"version\""));
        assert!(json.contains("\"references\""));
        assert!(json.contains("hg38_ucsc"));
    }

    /// Adding a reference grows the catalog and makes it retrievable by id.
    #[test]
    fn test_add_reference() {
        let mut catalog = ReferenceCatalog::new();
        assert_eq!(catalog.len(), 0);

        let reference = KnownReference::new(
            "test_ref",
            "Test Reference",
            Assembly::Grch38,
            ReferenceSource::Custom("test".to_string()),
        )
        .with_contigs(vec![Contig::new("chr1", 100).with_md5("abc123")]);

        catalog.add_reference(reference);
        assert_eq!(catalog.len(), 1);

        let retrieved = catalog
            .get(&ReferenceId::new("test_ref"))
            .expect("test_ref should be retrievable after add_reference");
        assert_eq!(retrieved.display_name, "Test Reference");
    }

    /// `name_length_set` of an embedded reference covers aliases as well as
    /// primary contig names.
    #[test]
    fn test_name_length_set_contains_aliases() {
        let catalog = ReferenceCatalog::load_embedded().unwrap();

        let ref38 = catalog
            .references
            .iter()
            .find(|r| r.id.0 == "grch38_ncbi")
            .expect("grch38_ncbi should exist");

        // More entries than contigs is only possible if aliases are included.
        assert!(
            ref38.name_length_set.len() > ref38.contigs.len(),
            "name_length_set should contain more entries than contigs (should include aliases), \
             got {} vs {} contigs",
            ref38.name_length_set.len(),
            ref38.contigs.len()
        );

        let chr1_key = ("chr1".to_string(), 248_956_422u64);
        assert!(
            ref38.name_length_set.contains(&chr1_key),
            "chr1 should be in name_length_set as an alias"
        );
    }

    /// Primary names and every alias are reachable through the catalog indexes.
    #[test]
    fn test_alias_indexing() {
        let mut catalog = ReferenceCatalog::new();

        let contigs = vec![
            Contig::new("NC_000001.11", 248_956_422)
                .with_md5("6aef897c3d6ff0c78aff06ac189178dd")
                .with_aliases(vec!["chr1".to_string(), "1".to_string()]),
            Contig::new("NC_000002.12", 242_193_529)
                .with_md5("f98db672eb0993dcfdabafe2a882905c")
                .with_aliases(vec!["chr2".to_string(), "2".to_string()]),
        ];

        let reference = KnownReference::new(
            "test_ncbi_ref",
            "Test NCBI Reference",
            Assembly::Grch38,
            ReferenceSource::Custom("test".to_string()),
        )
        .with_contigs(contigs);

        catalog.add_reference(reference);

        assert!(
            catalog
                .name_length_to_refs
                .contains_key(&("NC_000001.11".to_string(), 248_956_422)),
            "Primary name should be indexed"
        );

        // Every alias must be reachable both through the combined
        // name/length index and through the dedicated alias index that
        // `add_reference` fills from each contig's aliases.
        for (alias, length) in [
            ("chr1", 248_956_422u64),
            ("1", 248_956_422),
            ("chr2", 242_193_529),
            ("2", 242_193_529),
        ] {
            let key = (alias.to_string(), length);
            assert!(
                catalog.name_length_to_refs.contains_key(&key),
                "Alias '{}' should be indexed in name_length_to_refs",
                alias
            );
            assert!(
                catalog.alias_length_to_refs.contains_key(&key),
                "Alias '{}' should be indexed in alias_length_to_refs",
                alias
            );
        }
    }
}