1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::path::Path;
4use thiserror::Error;
5
6use crate::core::reference::KnownReference;
7use crate::core::types::ReferenceId;
8
9#[derive(Error, Debug)]
10pub enum CatalogError {
11 #[error("Failed to read catalog: {0}")]
12 ReadError(#[from] std::io::Error),
13
14 #[error("Failed to parse catalog: {0}")]
15 ParseError(#[from] serde_json::Error),
16}
17
/// Version string written into serialized catalogs and compared against the
/// `version` field of any catalog that is loaded.
pub const CATALOG_VERSION: &str = "1.0.0";
20
21#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct CatalogData {
24 pub version: String,
25 pub created_at: String,
26 pub references: Vec<KnownReference>,
27}
28
29#[derive(Debug)]
31pub struct ReferenceCatalog {
32 pub references: Vec<KnownReference>,
34
35 id_to_index: HashMap<ReferenceId, usize>,
37
38 pub md5_to_refs: HashMap<String, Vec<usize>>,
40
41 pub sha512t24u_to_refs: HashMap<String, Vec<usize>>,
43
44 pub name_length_to_refs: HashMap<(String, u64), Vec<usize>>,
46
47 pub alias_length_to_refs: HashMap<(String, u64), Vec<usize>>,
50
51 signature_to_ref: HashMap<String, usize>,
53}
54
55impl ReferenceCatalog {
56 #[must_use]
58 pub fn new() -> Self {
59 Self {
60 references: Vec::new(),
61 id_to_index: HashMap::new(),
62 md5_to_refs: HashMap::new(),
63 sha512t24u_to_refs: HashMap::new(),
64 name_length_to_refs: HashMap::new(),
65 alias_length_to_refs: HashMap::new(),
66 signature_to_ref: HashMap::new(),
67 }
68 }
69
70 pub fn load_embedded() -> Result<Self, CatalogError> {
76 const EMBEDDED_CATALOG: &str = include_str!("../../catalogs/human_references.json");
78 Self::from_json(EMBEDDED_CATALOG)
79 }
80
81 pub fn load_from_file(path: &Path) -> Result<Self, CatalogError> {
88 let content = std::fs::read_to_string(path)?;
89 Self::from_json(&content)
90 }
91
92 pub fn from_json(json: &str) -> Result<Self, CatalogError> {
98 let data: CatalogData = serde_json::from_str(json)?;
99
100 if data.version != CATALOG_VERSION {
102 eprintln!(
103 "Warning: Catalog version mismatch (expected {}, found {})",
104 CATALOG_VERSION, data.version
105 );
106 }
107
108 let mut catalog = Self::new();
109 for mut reference in data.references {
110 reference.rebuild_indexes();
111 catalog.add_reference(reference);
112 }
113
114 Ok(catalog)
115 }
116
117 pub fn add_reference(&mut self, reference: KnownReference) {
119 let index = self.references.len();
120
121 self.id_to_index.insert(reference.id.clone(), index);
123
124 for md5 in &reference.md5_set {
126 self.md5_to_refs.entry(md5.clone()).or_default().push(index);
127 }
128
129 for digest in &reference.sha512t24u_set {
131 self.sha512t24u_to_refs
132 .entry(digest.clone())
133 .or_default()
134 .push(index);
135 }
136
137 for (name, length) in &reference.name_length_set {
139 self.name_length_to_refs
140 .entry((name.clone(), *length))
141 .or_default()
142 .push(index);
143 }
144
145 for contig in &reference.contigs {
147 for alias in &contig.aliases {
148 self.alias_length_to_refs
149 .entry((alias.clone(), contig.length))
150 .or_default()
151 .push(index);
152 }
153 }
154
155 if let Some(sig) = &reference.signature {
157 self.signature_to_ref.insert(sig.clone(), index);
158 }
159
160 self.references.push(reference);
161 }
162
163 #[must_use]
165 pub fn get(&self, id: &ReferenceId) -> Option<&KnownReference> {
166 self.id_to_index.get(id).map(|&idx| &self.references[idx])
167 }
168
169 #[must_use]
171 pub fn find_by_signature(&self, signature: &str) -> Option<&KnownReference> {
172 self.signature_to_ref
173 .get(signature)
174 .map(|&idx| &self.references[idx])
175 }
176
177 pub fn to_json(&self) -> Result<String, CatalogError> {
183 let data = CatalogData {
184 version: CATALOG_VERSION.to_string(),
185 created_at: chrono::Utc::now().to_rfc3339(),
186 references: self.references.clone(),
187 };
188 Ok(serde_json::to_string_pretty(&data)?)
189 }
190
191 #[must_use]
193 pub fn len(&self) -> usize {
194 self.references.len()
195 }
196
197 #[must_use]
199 pub fn is_empty(&self) -> bool {
200 self.references.is_empty()
201 }
202}
203
204impl Default for ReferenceCatalog {
205 fn default() -> Self {
206 Self::new()
207 }
208}
209
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::contig::Contig;
    use crate::core::types::{Assembly, ReferenceSource};

    /// The catalog compiled into the binary parses and is not empty.
    #[test]
    fn test_load_embedded_catalog() {
        let embedded = ReferenceCatalog::load_embedded().unwrap();
        assert!(!embedded.is_empty());
    }

    /// A known ID resolves to the expected reference.
    #[test]
    fn test_catalog_get_by_id() {
        let embedded = ReferenceCatalog::load_embedded().unwrap();
        let wanted = ReferenceId::new("hg38_ucsc");

        let lookup = embedded.get(&wanted);
        assert!(lookup.is_some());

        let entry = lookup.unwrap();
        assert_eq!(entry.display_name, "hg38 (UCSC)");
        assert!(!entry.contigs.is_empty());
    }

    /// An unknown ID yields `None`.
    #[test]
    fn test_catalog_get_nonexistent() {
        let embedded = ReferenceCatalog::load_embedded().unwrap();
        let missing = ReferenceId::new("nonexistent_ref");
        assert!(embedded.get(&missing).is_none());
    }

    /// The serialized catalog carries the top-level keys and a known ID.
    #[test]
    fn test_catalog_to_json() {
        let embedded = ReferenceCatalog::load_embedded().unwrap();
        let serialized = embedded.to_json().unwrap();

        assert!(serialized.contains("\"version\""));
        assert!(serialized.contains("\"references\""));
        assert!(serialized.contains("hg38_ucsc"));
    }

    /// A reference added by hand is counted and retrievable by ID.
    #[test]
    fn test_add_reference() {
        let mut catalog = ReferenceCatalog::new();
        assert_eq!(catalog.len(), 0);

        let reference = KnownReference::new(
            "test_ref",
            "Test Reference",
            Assembly::Grch38,
            ReferenceSource::Custom("test".to_string()),
        )
        .with_contigs(vec![Contig::new("chr1", 100).with_md5("abc123")]);

        catalog.add_reference(reference);
        assert_eq!(catalog.len(), 1);

        let found = catalog.get(&ReferenceId::new("test_ref"));
        assert!(found.is_some());
        assert_eq!(found.unwrap().display_name, "Test Reference");
    }

    /// For the embedded NCBI GRCh38 entry, `name_length_set` holds alias
    /// names in addition to the primary contig names.
    #[test]
    fn test_name_length_set_contains_aliases() {
        let embedded = ReferenceCatalog::load_embedded().unwrap();

        let ncbi = embedded
            .references
            .iter()
            .find(|candidate| candidate.id.0 == "grch38_ncbi")
            .expect("grch38_ncbi should exist");

        let pair_count = ncbi.name_length_set.len();
        let contig_count = ncbi.contigs.len();

        assert!(
            pair_count > contig_count,
            "name_length_set should contain more entries than contigs (should include aliases), \
             got {} vs {} contigs",
            pair_count,
            contig_count
        );

        let chr1_key = (String::from("chr1"), 248_956_422_u64);
        assert!(
            ncbi.name_length_set.contains(&chr1_key),
            "chr1 should be in name_length_set as an alias"
        );

        println!(
            "name_length_set size: {} (contigs: {})",
            pair_count, contig_count,
        );
    }

    /// Primary names and contig aliases are both reachable through
    /// `name_length_to_refs` after `add_reference`.
    #[test]
    fn test_alias_indexing() {
        let first = Contig::new("NC_000001.11", 248_956_422)
            .with_md5("6aef897c3d6ff0c78aff06ac189178dd")
            .with_aliases(vec!["chr1".to_string(), "1".to_string()]);
        let second = Contig::new("NC_000002.12", 242_193_529)
            .with_md5("f98db672eb0993dcfdabafe2a882905c")
            .with_aliases(vec!["chr2".to_string(), "2".to_string()]);

        let reference = KnownReference::new(
            "test_ncbi_ref",
            "Test NCBI Reference",
            Assembly::Grch38,
            ReferenceSource::Custom("test".to_string()),
        )
        .with_contigs(vec![first, second]);

        let mut catalog = ReferenceCatalog::new();
        catalog.add_reference(reference);

        // Shorthand for "is this (name, length) pair a key of the index?".
        let indexed = |name: &str, length: u64| {
            catalog
                .name_length_to_refs
                .contains_key(&(name.to_string(), length))
        };

        assert!(
            indexed("NC_000001.11", 248_956_422),
            "Primary name should be indexed"
        );
        assert!(
            indexed("chr1", 248_956_422),
            "Alias 'chr1' should be indexed in name_length_to_refs"
        );
        assert!(
            indexed("1", 248_956_422),
            "Alias '1' should be indexed in name_length_to_refs"
        );
        assert!(
            indexed("chr2", 242_193_529),
            "Alias 'chr2' should be indexed in name_length_to_refs"
        );
    }
}