1use super::*;
4use super::alias::AliasManager;
5
6use std::collections::HashMap;
7use std::fmt::{Display, Formatter};
8use std::path::{Path, PathBuf};
9
10use indexmap::IndexMap;
11
12use anyhow::{anyhow, Context, Result};
13
14use crate::collection::{read_rgsi_file, SequenceMetadataExt, SequenceRecordExt};
15use crate::digest::lookup_alphabet;
16use crate::digest::{
17 SequenceCollectionMetadata, SequenceCollectionRecord, SequenceMetadata,
18 SequenceRecord,
19};
20use crate::digest::{decode_string_from_bytes, decode_substring_from_bytes, encode_sequence};
21use crate::hashkeyable::{DigestKey, HashKeyable, key_to_digest_string};
22use crate::seqcol::metadata_matches_attribute;
23
24use std::fs::{self, create_dir_all};
25
/// In-memory view of a refget sequence/collection store.
///
/// Holds sequence and collection records keyed by digest, plus lookup
/// tables for MD5 digests and per-collection sequence names. The store may
/// be backed by a local directory and/or a remote source, with optional
/// write-through persistence.
#[derive(Debug)]
pub struct ReadonlyRefgetStore {
    // Primary sequence records, keyed by SHA512t24u digest key.
    pub(crate) sequence_store: HashMap<DigestKey, SequenceRecord>,
    // Maps an MD5 digest key to the canonical SHA512t24u key.
    pub(crate) md5_lookup: HashMap<DigestKey, DigestKey>,

    // Per-collection map of sequence name -> SHA512t24u key. Presence of a
    // collection's key here is what marks that collection as "loaded".
    pub(crate) name_lookup: HashMap<DigestKey, IndexMap<String, DigestKey>>,
    // Collection records (stub or full), keyed by collection digest.
    pub(crate) collections: HashMap<DigestKey, SequenceCollectionRecord>,
    // Whether sequence bytes are held raw or encoded (packed).
    pub(crate) mode: StorageMode,
    // Root directory of the on-disk store, if any.
    pub(crate) local_path: Option<PathBuf>,
    // Base URL of a remote store to fetch from, if any.
    pub(crate) remote_source: Option<String>,
    // Template with %s/%s2/%s4 placeholders for sequence data file paths.
    pub(crate) seqdata_path_template: Option<String>,
    // When true, mutations are written through to `local_path`.
    pub(crate) persist_to_disk: bool,
    // Suppresses progress messages printed to stderr.
    pub(crate) quiet: bool,
    // NOTE(review): flag toggled at construction; exact consumers are not
    // visible in this file — confirm semantics before relying on it.
    pub(crate) ancillary_digests: bool,
    // NOTE(review): same — attribute-index flag, consumers not visible here.
    pub(crate) attribute_index: bool,
    // Namespace -> alias mappings for sequences and collections.
    pub(crate) aliases: AliasManager,
    // Per-collection sidecar metadata (see the `fhr_metadata` module).
    pub(crate) fhr_metadata: HashMap<DigestKey, super::fhr_metadata::FhrMetadata>,
    // Alias namespaces advertised as available for sequences/collections.
    pub(crate) available_sequence_alias_namespaces: Vec<String>,
    pub(crate) available_collection_alias_namespaces: Vec<String>,
    // Cache of fully decoded sequence bytes, keyed by SHA512t24u key.
    pub(crate) decoded_cache: HashMap<DigestKey, Vec<u8>>,
}
75
76impl ReadonlyRefgetStore {
77 pub(crate) fn new(mode: StorageMode) -> Self {
80 ReadonlyRefgetStore {
81 sequence_store: HashMap::new(),
82 md5_lookup: HashMap::new(),
83 name_lookup: HashMap::new(),
84 collections: HashMap::new(),
85 mode,
86 local_path: None,
87 remote_source: None,
88 seqdata_path_template: None,
89 persist_to_disk: false,
90 quiet: false,
91 ancillary_digests: true,
92 attribute_index: false,
93 aliases: AliasManager::default(),
94 fhr_metadata: HashMap::new(),
95 decoded_cache: HashMap::new(),
96 available_sequence_alias_namespaces: Vec::new(),
97 available_collection_alias_namespaces: Vec::new(),
98 }
99 }
100
101 pub fn set_quiet(&mut self, quiet: bool) {
103 self.quiet = quiet;
104 }
105
106 pub fn is_quiet(&self) -> bool {
108 self.quiet
109 }
110
111 pub fn store_exists<P: AsRef<Path>>(path: P) -> bool {
113 path.as_ref().join("rgstore.json").exists()
114 }
115
    /// Convert all loaded sequence data to `new_mode` and record it as the
    /// store's storage mode.
    ///
    /// Full records are re-encoded (Raw -> Encoded) or decoded
    /// (Encoded -> Raw) in place; stubs carry no data and are untouched.
    /// No-op when the store is already in `new_mode`.
    pub fn set_encoding_mode(&mut self, new_mode: StorageMode) {
        if self.mode == new_mode {
            return;
        }

        for record in self.sequence_store.values_mut() {
            match record {
                SequenceRecord::Full { metadata, sequence } => {
                    // Only the two cross-mode transitions require work.
                    match (self.mode, new_mode) {
                        (StorageMode::Raw, StorageMode::Encoded) => {
                            let alphabet = lookup_alphabet(&metadata.alphabet);
                            *sequence = encode_sequence(&*sequence, alphabet);
                        }
                        (StorageMode::Encoded, StorageMode::Raw) => {
                            let alphabet = lookup_alphabet(&metadata.alphabet);
                            // Decoding needs the logical length, since the
                            // encoded bytes are packed.
                            *sequence =
                                decode_string_from_bytes(&*sequence, metadata.length, alphabet);
                        }
                        _ => {}
                    }
                }
                SequenceRecord::Stub(_) => {}
            }
        }

        self.mode = new_mode;
    }
144
145 pub fn enable_encoding(&mut self) {
147 self.set_encoding_mode(StorageMode::Encoded);
148 }
149
150 pub fn disable_encoding(&mut self) {
152 self.set_encoding_mode(StorageMode::Raw);
153 }
154
    /// Turn on write-through persistence rooted at `path`.
    ///
    /// Creates the `sequences/` and `collections/` directories, writes
    /// every currently loaded Full sequence record to disk (replacing it in
    /// memory with a Stub to free the data), writes all collection records,
    /// and finally rewrites the index files.
    ///
    /// # Errors
    /// Propagates any filesystem error from directory creation or the
    /// individual write steps.
    pub fn enable_persistence<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
        let path = path.as_ref();

        self.local_path = Some(path.to_path_buf());
        self.persist_to_disk = true;
        // Keep an existing template; otherwise fall back to the default.
        self.seqdata_path_template
            .get_or_insert_with(|| DEFAULT_SEQDATA_PATH_TEMPLATE.to_string());

        create_dir_all(path.join("sequences"))?;
        create_dir_all(path.join("collections"))?;

        // Collect keys first so records can be replaced while iterating.
        let keys: Vec<DigestKey> = self.sequence_store.keys().cloned().collect();
        for key in keys {
            if let Some(SequenceRecord::Full { metadata, sequence }) = self.sequence_store.get(&key)
            {
                self.write_sequence_to_disk_single(metadata, sequence)?;
                // Swap the in-memory Full record for a lightweight Stub.
                let stub = SequenceRecord::Stub(metadata.clone());
                self.sequence_store.insert(key, stub);
            }
        }

        for record in self.collections.values() {
            self.write_collection_to_disk_single(record)?;
        }

        self.write_index_files()?;

        Ok(())
    }
185
186 pub fn disable_persistence(&mut self) {
188 self.persist_to_disk = false;
189 }
190
191 pub fn is_persisting(&self) -> bool {
193 self.persist_to_disk
194 }
195
196 pub fn add_sequence<T: Into<Option<DigestKey>>>(
198 &mut self,
199 sequence_record: SequenceRecord,
200 collection_digest: T,
201 force: bool,
202 ) -> Result<()> {
203 let collection_digest = collection_digest
204 .into()
205 .ok_or_else(|| anyhow::anyhow!("Collection digest is required"))?;
206 self.collections.get(&collection_digest).ok_or_else(|| {
207 anyhow::anyhow!("Collection not found for digest: {:?}", collection_digest)
208 })?;
209
210 let metadata = sequence_record.metadata();
211
212 self.name_lookup
213 .entry(collection_digest)
214 .or_default()
215 .insert(metadata.name.clone(), metadata.sha512t24u.to_key());
216
217 self.add_sequence_record(sequence_record, force)?;
218
219 Ok(())
220 }
221
222 pub fn add_sequence_collection(
224 &mut self,
225 collection: crate::digest::SequenceCollection,
226 ) -> Result<()> {
227 self.add_sequence_collection_internal(collection, false)
228 }
229
230 pub fn add_sequence_collection_force(
232 &mut self,
233 collection: crate::digest::SequenceCollection,
234 ) -> Result<()> {
235 self.add_sequence_collection_internal(collection, true)
236 }
237
    /// Shared implementation for adding a collection.
    ///
    /// When `force` is false an already-present digest is a silent no-op.
    /// The collection record is stored with Stub sequence entries; the
    /// actual sequence records are registered afterwards via
    /// `add_sequence`, which also fills the name lookup. Disk writes only
    /// happen when persistence is enabled and a local path is set.
    pub(crate) fn add_sequence_collection_internal(
        &mut self,
        collection: crate::digest::SequenceCollection,
        force: bool,
    ) -> Result<()> {
        let coll_digest = collection.metadata.digest.to_key();

        if !force && self.collections.contains_key(&coll_digest) {
            return Ok(());
        }

        let crate::digest::SequenceCollection { metadata, sequences } = collection;

        // The stored record keeps only sequence stubs; full data lives in
        // the sequence store.
        let record = SequenceCollectionRecord::Full {
            metadata: metadata.clone(),
            sequences: sequences.iter().map(|s| SequenceRecord::Stub(s.metadata().clone())).collect(),
        };

        if self.persist_to_disk && self.local_path.is_some() {
            self.write_collection_to_disk_single(&record)?;
        }

        self.collections.insert(coll_digest, record);

        for sequence_record in sequences {
            self.add_sequence(sequence_record, coll_digest, force)?;
        }

        // Refresh the on-disk indexes once, after all sequences are added.
        if self.persist_to_disk && self.local_path.is_some() {
            self.write_index_files()?;
        }

        Ok(())
    }
273
    /// Insert a sequence record keyed by its SHA512t24u digest.
    ///
    /// Also records the MD5 -> SHA512t24u mapping. With persistence enabled
    /// and a local path configured, Full records are written to disk and
    /// only a Stub is kept in memory; otherwise the record is stored as-is.
    /// When `force` is false an existing digest is a silent no-op.
    pub fn add_sequence_record(&mut self, sr: SequenceRecord, force: bool) -> Result<()> {
        let metadata = sr.metadata();
        let key = metadata.sha512t24u.to_key();

        if !force && self.sequence_store.contains_key(&key) {
            return Ok(());
        }

        self.md5_lookup
            .insert(metadata.md5.to_key(), metadata.sha512t24u.to_key());

        if self.persist_to_disk && self.local_path.is_some() {
            match &sr {
                SequenceRecord::Full { metadata, sequence } => {
                    self.write_sequence_to_disk_single(metadata, sequence)?;
                    // Data is on disk now; keep only a Stub in memory.
                    let stub = SequenceRecord::Stub(metadata.clone());
                    self.sequence_store.insert(key, stub);
                    return Ok(());
                }
                SequenceRecord::Stub(_) => {}
            }
        }

        self.sequence_store.insert(key, sr);
        Ok(())
    }
301
302 pub fn sequence_digests(&self) -> impl Iterator<Item = DigestKey> + '_ {
308 self.sequence_store.keys().cloned()
309 }
310
311 pub fn sequence_metadata(&self) -> impl Iterator<Item = &SequenceMetadata> + '_ {
313 self.sequence_store.values().map(|rec| rec.metadata())
314 }
315
316 pub fn total_disk_size(&self) -> usize {
318 self.sequence_store
319 .values()
320 .map(|rec| rec.metadata().disk_size(&self.mode))
321 .sum()
322 }
323
    /// Bytes actually used on disk under `local_path`, summed recursively.
    ///
    /// Returns 0 when the store has no local path. I/O errors while
    /// walking are deliberately ignored (best-effort accounting).
    pub fn actual_disk_usage(&self) -> usize {
        let Some(path) = &self.local_path else {
            return 0;
        };

        // Recursively sum file sizes beneath `path`.
        fn dir_size(path: &std::path::Path) -> usize {
            let mut total = 0;
            if let Ok(entries) = std::fs::read_dir(path) {
                for entry in entries.flatten() {
                    let path = entry.path();
                    if path.is_file() {
                        // Unreadable metadata counts as zero bytes.
                        total += entry.metadata().map(|m| m.len() as usize).unwrap_or(0);
                    } else if path.is_dir() {
                        total += dir_size(&path);
                    }
                }
            }
            total
        }

        dir_size(path)
    }
347
348 pub fn list_collections(
354 &self,
355 page: usize,
356 page_size: usize,
357 filters: &[(&str, &str)],
358 ) -> Result<PagedResult<SequenceCollectionMetadata>> {
359 let mut filtered: Vec<SequenceCollectionMetadata> = Vec::new();
360 for record in self.collections.values() {
361 let meta = record.metadata();
362 let mut passes = true;
363 for &(attr_name, attr_digest) in filters {
364 if !metadata_matches_attribute(meta, attr_name, attr_digest)? {
365 passes = false;
366 break;
367 }
368 }
369 if passes {
370 filtered.push(meta.clone());
371 }
372 }
373
374 filtered.sort_by(|a, b| a.digest.cmp(&b.digest));
375
376 let total = filtered.len();
377 let start = page * page_size;
378 let results = if start < total {
379 filtered.into_iter().skip(start).take(page_size).collect()
380 } else {
381 Vec::new()
382 };
383
384 Ok(PagedResult {
385 results,
386 pagination: Pagination {
387 page,
388 page_size,
389 total,
390 },
391 })
392 }
393
394 pub fn get_collection_metadata<K: AsRef<[u8]>>(
396 &self,
397 collection_digest: K,
398 ) -> Option<&SequenceCollectionMetadata> {
399 let key = collection_digest.to_key();
400 self.collections.get(&key).map(|record| record.metadata())
401 }
402
403 pub fn get_collection(&self, collection_digest: &str) -> Result<crate::digest::SequenceCollection> {
405 let key = collection_digest.to_key();
406
407 if !self.name_lookup.contains_key(&key) {
408 return Err(anyhow!(
409 "Collection not loaded: {}. Call load_collection() or load_all_collections() first.",
410 collection_digest
411 ));
412 }
413
414 let metadata = self
415 .collections
416 .get(&key)
417 .ok_or_else(|| anyhow!("Collection not found: {}", collection_digest))?
418 .metadata()
419 .clone();
420
421 let sequences: Vec<SequenceRecord> = self
424 .name_lookup
425 .get(&key)
426 .map(|name_map| {
427 name_map
428 .iter()
429 .filter_map(|(name, seq_key)| {
430 let record = self.sequence_store.get(seq_key)?;
431 let mut meta = record.metadata().clone();
432 meta.name = name.clone();
433 Some(match record.sequence() {
434 Some(seq) => SequenceRecord::Full {
435 metadata: meta,
436 sequence: seq.to_vec(),
437 },
438 None => SequenceRecord::Stub(meta),
439 })
440 })
441 .collect()
442 })
443 .unwrap_or_default();
444
445 Ok(crate::digest::SequenceCollection {
446 metadata,
447 sequences,
448 })
449 }
450
    /// Remove a collection; returns `Ok(false)` when the digest is unknown.
    ///
    /// Cleans up the name lookup, FHR metadata, and collection aliases
    /// (persisting each affected alias namespace). When
    /// `remove_orphan_sequences` is set, sequences no longer referenced by
    /// any remaining collection are dropped from memory and, when
    /// persisting, their data files are deleted best-effort. Index files
    /// are rewritten at the end when persisting.
    pub fn remove_collection(
        &mut self,
        digest: &str,
        remove_orphan_sequences: bool,
    ) -> Result<bool> {
        let key = digest.to_key();

        if self.collections.remove(&key).is_none() {
            return Ok(false);
        }

        // Capture this collection's sequences before dropping its name map;
        // they are orphan candidates until proven referenced elsewhere.
        let orphan_candidates: Vec<DigestKey> = self
            .name_lookup
            .get(&key)
            .map(|name_map| name_map.values().cloned().collect())
            .unwrap_or_default();

        self.name_lookup.remove(&key);
        self.fhr_metadata.remove(&key);

        // Remove every alias pointing at this collection, then persist each
        // namespace that was touched.
        let alias_pairs = self.aliases.reverse_lookup_collection(digest);
        let affected_namespaces: std::collections::HashSet<String> = alias_pairs
            .iter()
            .map(|(ns, _)| ns.clone())
            .collect();
        for (ns, alias) in &alias_pairs {
            self.aliases.remove_collection(ns, alias);
        }
        for ns in &affected_namespaces {
            self.persist_alias_namespace(AliasKind::Collection, ns)?;
        }

        if remove_orphan_sequences && !orphan_candidates.is_empty() {
            // Build the set of sequences still referenced by any other
            // collection; only unreferenced candidates are true orphans.
            let mut still_referenced: std::collections::HashSet<DigestKey> =
                std::collections::HashSet::new();
            for name_map in self.name_lookup.values() {
                for seq_key in name_map.values() {
                    still_referenced.insert(*seq_key);
                }
            }

            let orphans: Vec<DigestKey> = orphan_candidates
                .into_iter()
                .filter(|k| !still_referenced.contains(k))
                .collect();

            for orphan_key in &orphans {
                self.sequence_store.remove(orphan_key);
                self.md5_lookup.retain(|_, v| v != orphan_key);
                self.decoded_cache.remove(orphan_key);
            }

            if self.persist_to_disk {
                if let (Some(local_path), Some(template)) =
                    (&self.local_path, &self.seqdata_path_template)
                {
                    for orphan_key in &orphans {
                        let orphan_digest = key_to_digest_string(orphan_key);
                        let seq_file_path = Self::expand_template(&orphan_digest, template);
                        let full_path = local_path.join(&seq_file_path);
                        // Best-effort cleanup: remove the file and, if now
                        // empty, its parent directory (remove_dir only
                        // succeeds on empty directories).
                        let _ = fs::remove_file(&full_path);
                        if let Some(parent) = full_path.parent() {
                            let _ = fs::remove_dir(parent);
                        }
                    }
                }
            }
        }

        if self.persist_to_disk {
            if let Some(local_path) = &self.local_path {
                // Best-effort removal of the collection's on-disk sidecars.
                let rgsi_path = local_path.join(format!("collections/{}.rgsi", digest));
                let _ = fs::remove_file(&rgsi_path);
                let fhr_path = local_path.join(format!("fhr/{}.fhr.json", digest));
                let _ = fs::remove_file(&fhr_path);
            }
            self.write_index_files()?;
        }

        Ok(true)
    }
534
    /// Copy a collection (with its sequences, aliases, and FHR metadata)
    /// from another store into this one.
    ///
    /// The collection must already be loaded in `source` (see
    /// `get_collection`). Sequence and collection aliases are
    /// re-registered here one by one.
    ///
    /// # Errors
    /// Propagates failures from materializing the collection in `source`
    /// or from any of the add/alias/metadata calls.
    pub fn import_collection(&mut self, source: &ReadonlyRefgetStore, digest: &str) -> Result<()> {
        let collection = source.get_collection(digest)?;
        self.add_sequence_collection(collection)?;

        let coll_key = digest.to_key();
        // Carry over per-sequence aliases for every sequence in the
        // collection.
        if let Some(name_map) = source.name_lookup.get(&coll_key) {
            for seq_key in name_map.values() {
                let seq_digest = key_to_digest_string(seq_key);
                for (ns, alias) in source.aliases.reverse_lookup_sequence(&seq_digest) {
                    self.add_sequence_alias(&ns, &alias, &seq_digest)?;
                }
            }
        }

        for (ns, alias) in source.aliases.reverse_lookup_collection(digest) {
            self.add_collection_alias(&ns, &alias, digest)?;
        }

        if let Some(fhr) = source.get_fhr_metadata(digest) {
            self.set_fhr_metadata(digest, fhr.clone())?;
        }

        Ok(())
    }
571
572 pub fn list_sequences(&self) -> Vec<SequenceMetadata> {
578 let mut result: Vec<_> = self
579 .sequence_store
580 .values()
581 .map(|rec| rec.metadata().clone())
582 .collect();
583 result.sort_by(|a, b| a.sha512t24u.cmp(&b.sha512t24u));
584 result
585 }
586
587 pub fn get_sequence_metadata<K: AsRef<[u8]>>(
589 &self,
590 seq_digest: K,
591 ) -> Option<&SequenceMetadata> {
592 let key = seq_digest.to_key();
593 self.sequence_store.get(&key).map(|rec| rec.metadata())
594 }
595
596 pub fn get_sequence<K: AsRef<[u8]>>(&self, seq_digest: K) -> Result<&SequenceRecord> {
598 let digest_key = seq_digest.to_key();
599 let actual_key = self
600 .md5_lookup
601 .get(&digest_key)
602 .copied()
603 .unwrap_or(digest_key);
604 self.sequence_store.get(&actual_key).ok_or_else(|| {
605 anyhow!(
606 "Sequence not found: {}",
607 String::from_utf8_lossy(seq_digest.as_ref())
608 )
609 })
610 }
611
612 pub fn ensure_decoded<K: AsRef<[u8]>>(&mut self, seq_digest: K) -> Result<()> {
614 let digest_key = seq_digest.to_key();
615 let actual_key = self
616 .md5_lookup
617 .get(&digest_key)
618 .copied()
619 .unwrap_or(digest_key);
620
621 if self.decoded_cache.contains_key(&actual_key) {
622 return Ok(());
623 }
624
625 let record = self
626 .sequence_store
627 .get(&actual_key)
628 .ok_or_else(|| anyhow!("Sequence not found"))?;
629 let decoded = record
630 .decode()
631 .ok_or_else(|| anyhow!("Sequence not loaded (stub). Call load_sequence() first."))?;
632
633 self.decoded_cache.insert(actual_key, decoded.into_bytes());
634 Ok(())
635 }
636
637 pub fn clear_decoded_cache(&mut self) {
639 self.decoded_cache.clear();
640 }
641
    /// Drop all sequence records and the decoded-byte cache.
    ///
    /// NOTE(review): `md5_lookup`, `name_lookup`, and `collections` are
    /// left intact, so lookups made after `clear()` may still reference
    /// sequences that no longer exist — confirm this scope is intended.
    pub fn clear(&mut self) {
        self.sequence_store.clear();
        self.decoded_cache.clear();
    }
647
648 pub fn sequence_bytes<K: AsRef<[u8]>>(&self, seq_digest: K) -> Option<&[u8]> {
650 let digest_key = seq_digest.to_key();
651 let actual_key = self
652 .md5_lookup
653 .get(&digest_key)
654 .copied()
655 .unwrap_or(digest_key);
656 self.decoded_cache.get(&actual_key).map(|v| v.as_slice())
657 }
658
659 pub fn get_sequence_by_name<K: AsRef<[u8]>>(
661 &self,
662 collection_digest: K,
663 sequence_name: &str,
664 ) -> Result<&SequenceRecord> {
665 let collection_key = collection_digest.to_key();
666
667 if !self.name_lookup.contains_key(&collection_key) {
668 return Err(anyhow!(
669 "Collection not loaded. Call load_collection() or load_all_collections() first."
670 ));
671 }
672
673 let digest_key = self.name_lookup.get(&collection_key)
674 .and_then(|name_map| name_map.get(sequence_name).cloned())
675 .ok_or_else(|| anyhow!("Sequence '{}' not found in collection", sequence_name))?;
676
677 let record = self.sequence_store.get(&digest_key).ok_or_else(|| {
678 anyhow!("Sequence record not found for '{}'. Call load_sequence() first.", sequence_name)
679 })?;
680
681 Ok(record)
682 }
683
684 pub fn load_all_collections(&mut self) -> Result<()> {
690 let keys: Vec<DigestKey> = self.collections.keys().cloned().collect();
691 for key in keys {
692 self.ensure_collection_loaded(&key)?;
693 }
694 Ok(())
695 }
696
697 pub fn load_all_sequences(&mut self) -> Result<()> {
699 let keys: Vec<DigestKey> = self.sequence_store.keys().cloned().collect();
700 for key in keys {
701 self.ensure_sequence_loaded(&key)?;
702 }
703 Ok(())
704 }
705
706 pub fn load_collection(&mut self, digest: &str) -> Result<()> {
708 let key = digest.to_key();
709 self.ensure_collection_loaded(&key)
710 }
711
712 pub fn load_sequence(&mut self, digest: &str) -> Result<()> {
714 let key = digest.to_key();
715 self.ensure_sequence_loaded(&key)
716 }
717
718 pub fn iter_collections(&self) -> impl Iterator<Item = crate::digest::SequenceCollection> + '_ {
720 let mut digests: Vec<String> = self
721 .collections
722 .values()
723 .map(|rec| rec.metadata().digest.clone())
724 .collect();
725 digests.sort();
726
727 digests.into_iter().filter_map(move |digest| {
728 self.get_collection(&digest).ok()
729 })
730 }
731
732 pub fn iter_sequences(&self) -> impl Iterator<Item = SequenceRecord> + '_ {
734 let mut records: Vec<_> = self.sequence_store.values().cloned().collect();
735 records.sort_by(|a, b| a.metadata().sha512t24u.cmp(&b.metadata().sha512t24u));
736 records.into_iter()
737 }
738
739 pub fn is_collection_loaded<K: AsRef<[u8]>>(&self, collection_digest: K) -> bool {
741 let key = collection_digest.to_key();
742 self.collections
743 .get(&key)
744 .map_or(false, |record| record.has_sequences())
745 }
746
747 pub fn local_path(&self) -> Option<&PathBuf> {
749 self.local_path.as_ref()
750 }
751
752 pub fn remote_source(&self) -> Option<&str> {
754 self.remote_source.as_deref()
755 }
756
757 pub fn storage_mode(&self) -> StorageMode {
759 self.mode
760 }
761
    /// Extract the `[start, end)` window of a sequence as a UTF-8 string.
    ///
    /// Looks the record up by SHA512t24u digest only (no MD5 translation
    /// here, unlike `get_sequence`) and decodes the requested window on the
    /// fly when the store is in encoded mode.
    ///
    /// # Errors
    /// Fails when the digest is unknown, the record is a stub, the range is
    /// out of bounds or empty (`start >= end`), or the bytes are not valid
    /// UTF-8.
    pub fn get_substring<K: AsRef<[u8]>>(
        &self,
        sha512_digest: K,
        start: usize,
        end: usize,
    ) -> Result<String> {
        let digest_key = sha512_digest.to_key();

        let record = self.sequence_store.get(&digest_key).ok_or_else(|| {
            anyhow!(
                "Sequence not found: {}",
                String::from_utf8_lossy(sha512_digest.as_ref())
            )
        })?;
        let (metadata, sequence) = match record {
            SequenceRecord::Stub(_) => return Err(anyhow!("Sequence data not loaded (stub only)")),
            SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
        };

        // Validate against the logical sequence length, not the stored byte
        // length (which may be packed).
        if start >= metadata.length || end > metadata.length || start >= end {
            return Err(anyhow!(
                "Invalid substring range: start={}, end={}, sequence length={}",
                start,
                end,
                metadata.length
            ));
        }

        match self.mode {
            StorageMode::Encoded => {
                // Decode only the requested window from the packed bytes.
                let alphabet = lookup_alphabet(&metadata.alphabet);
                let decoded_sequence = decode_substring_from_bytes(sequence, start, end, alphabet);
                String::from_utf8(decoded_sequence)
                    .map_err(|e| anyhow!("Failed to decode UTF-8 sequence: {}", e))
            }
            StorageMode::Raw => {
                let raw_slice: &[u8] = &sequence[start..end];
                String::from_utf8(raw_slice.to_vec())
                    .map_err(|e| anyhow!("Failed to decode UTF-8 sequence: {}", e))
            }
        }
    }
809
810 pub(crate) fn expand_template(digest_str: &str, template: &str) -> PathBuf {
816 debug_assert!(
817 digest_str.len() >= 4,
818 "Digest string must be at least 4 characters for template expansion, got {} chars",
819 digest_str.len()
820 );
821 let path_str = template
822 .replace("%s2", digest_str.get(0..2).unwrap_or(digest_str))
823 .replace("%s4", digest_str.get(0..4).unwrap_or(digest_str))
824 .replace("%s", digest_str);
825 PathBuf::from(path_str)
826 }
827
    /// Validate that `path` cannot escape the store root.
    ///
    /// Rejects absolute paths (leading `/` or `\`), any occurrence of the
    /// `..` substring (deliberately conservative: also rejects names that
    /// merely contain two consecutive dots), and embedded NUL bytes.
    ///
    /// NOTE(review): Windows drive-letter paths (e.g. `C:\x`) are not
    /// caught by the leading-separator check — confirm callers never pass
    /// them.
    pub(crate) fn sanitize_relative_path(path: &str) -> Result<()> {
        if path.starts_with('/') || path.starts_with('\\') {
            return Err(anyhow!("Absolute paths not allowed: {}", path));
        }
        if path.contains("..") {
            return Err(anyhow!("Directory traversal not allowed: {}", path));
        }
        if path.contains('\0') {
            return Err(anyhow!("Null bytes not allowed in path"));
        }
        Ok(())
    }
841
    /// Read `relative_path` from the local cache or fetch it from the
    /// remote source.
    ///
    /// Lookup order: when persisting (and not forcing a refresh), an
    /// existing local copy wins; otherwise the file is downloaded from
    /// `remote_source` and, when persisting, cached back under
    /// `local_path`. The path is sanitized first so it cannot escape the
    /// store root.
    ///
    /// # Errors
    /// Fails on an unsafe path, local read errors, HTTP errors, or when
    /// the file is absent locally and no remote source is configured.
    pub(crate) fn fetch_file(
        local_path: &Option<PathBuf>,
        remote_source: &Option<String>,
        relative_path: &str,
        persist_to_disk: bool,
        force_refresh: bool,
    ) -> Result<Vec<u8>> {
        Self::sanitize_relative_path(relative_path)?;

        // Cache hit: serve the local copy without touching the network.
        if persist_to_disk && !force_refresh {
            if let Some(local_path) = local_path {
                let full_local_path = local_path.join(relative_path);
                if full_local_path.exists() {
                    return fs::read(&full_local_path).context(format!(
                        "Failed to read local file: {}",
                        full_local_path.display()
                    ));
                }
            }
        }

        if let Some(remote_url) = remote_source {
            // Join base URL and path without doubling the separator.
            let full_remote_url = if remote_url.ends_with('/') {
                format!("{}{}", remote_url, relative_path)
            } else {
                format!("{}/{}", remote_url, relative_path)
            };

            let response = ureq::get(&full_remote_url)
                .call()
                .map_err(|e| anyhow!("Failed to fetch from remote: {}", e))?;

            let mut data = Vec::new();
            response
                .into_reader()
                .read_to_end(&mut data)
                .context("Failed to read response body")?;

            // Write-through cache of the downloaded bytes.
            if persist_to_disk {
                if let Some(local_path) = local_path {
                    let full_local_path = local_path.join(relative_path);

                    if let Some(parent) = full_local_path.parent() {
                        create_dir_all(parent)?;
                    }

                    fs::write(&full_local_path, &data).context(format!(
                        "Failed to cache file to: {}",
                        full_local_path.display()
                    ))?;
                }
            }

            Ok(data)
        } else {
            Err(anyhow!(
                "File not found locally and no remote source configured: {}",
                relative_path
            ))
        }
    }
904
    /// Make sure the collection's name map (and metadata) are in memory.
    ///
    /// A collection whose digest already appears in `name_lookup` counts
    /// as loaded. A Stub (or entirely missing) record triggers a fetch of
    /// `collections/<digest>.rgsi` (local cache first, then the remote
    /// source); a Full record already in memory is indexed without I/O.
    ///
    /// # Errors
    /// Fails on fetch/parse errors, when no local path is configured for
    /// reading the fetched file back, or when the loaded file's digest
    /// does not match the requested one.
    pub(crate) fn ensure_collection_loaded(&mut self, collection_digest: &DigestKey) -> Result<()> {
        if self.name_lookup.contains_key(collection_digest) {
            return Ok(());
        }

        let needs_fetch = match self.collections.get(collection_digest) {
            Some(SequenceCollectionRecord::Stub(_)) => true,
            Some(SequenceCollectionRecord::Full { .. }) => false,
            None => true,
        };

        if needs_fetch {
            // Prefer the digest string recorded in the stub metadata; fall
            // back to re-encoding the key.
            let digest_str = if let Some(SequenceCollectionRecord::Stub(meta)) =
                self.collections.get(collection_digest)
            {
                meta.digest.clone()
            } else {
                key_to_digest_string(collection_digest)
            };

            let relative_path = format!("collections/{}.rgsi", digest_str);

            if !self.quiet {
                let cached = self
                    .local_path
                    .as_ref()
                    .map(|p| p.join(&relative_path).exists())
                    .unwrap_or(false);
                let verb = if cached { "Loading" } else { "Downloading" };
                eprintln!("{} collection metadata {}...", verb, digest_str);
            }
            // persist=true so the fetched file lands on disk for the
            // read_rgsi_file call below.
            let _collection_data =
                Self::fetch_file(&self.local_path, &self.remote_source, &relative_path, true, false)?;

            let local_path = self
                .local_path
                .as_ref()
                .ok_or_else(|| anyhow!("No local path configured"))?;

            let collection_file_path = local_path.join(&relative_path);

            let collection = read_rgsi_file(&collection_file_path)?;

            // Guard against a corrupted or mislabeled file.
            let loaded_digest = collection.metadata.digest.to_key();
            if loaded_digest != *collection_digest {
                return Err(anyhow!(
                    "Collection digest mismatch: expected {}, got {}",
                    key_to_digest_string(collection_digest),
                    key_to_digest_string(&loaded_digest)
                ));
            }

            let mut name_map = IndexMap::new();
            for sequence_record in &collection.sequences {
                let metadata = sequence_record.metadata();
                let sha512_key = metadata.sha512t24u.to_key();
                name_map.insert(metadata.name.clone(), sha512_key);

                // Register stubs (and the MD5 mapping) for sequences not
                // seen before; existing records are left untouched.
                if !self.sequence_store.contains_key(&sha512_key) {
                    self.sequence_store
                        .insert(sha512_key, SequenceRecord::Stub(metadata.clone()));
                    let md5_key = metadata.md5.to_key();
                    self.md5_lookup.insert(md5_key, sha512_key);
                }
            }
            self.name_lookup.insert(*collection_digest, name_map);

            let record = SequenceCollectionRecord::from(collection);
            self.collections.insert(*collection_digest, record);
        } else {
            // Full record already in memory: build the name map from its
            // sequence stubs. The data is copied out first to end the
            // borrow on `self.collections` before mutating other tables.
            let sequences_data: Vec<(SequenceMetadata, DigestKey, DigestKey)> =
                if let Some(SequenceCollectionRecord::Full { sequences, .. }) =
                    self.collections.get(collection_digest)
                {
                    sequences
                        .iter()
                        .map(|seq| {
                            let metadata = seq.metadata().clone();
                            let sha512_key = metadata.sha512t24u.to_key();
                            let md5_key = metadata.md5.to_key();
                            (metadata, sha512_key, md5_key)
                        })
                        .collect()
                } else {
                    Vec::new()
                };

            let mut name_map = IndexMap::new();
            for (metadata, sha512_key, md5_key) in sequences_data {
                name_map.insert(metadata.name.clone(), sha512_key);

                if !self.sequence_store.contains_key(&sha512_key) {
                    self.sequence_store
                        .insert(sha512_key, SequenceRecord::Stub(metadata));
                    self.md5_lookup.insert(md5_key, sha512_key);
                }
            }
            self.name_lookup.insert(*collection_digest, name_map);
        }

        Ok(())
    }
1008
    /// Make sure the sequence record for `digest` holds its data.
    ///
    /// A Full record is a no-op. For a Stub, the data file path is derived
    /// from the SHA512t24u digest via the configured path template, fetched
    /// (local cache first, then remote), and loaded into the record.
    ///
    /// # Errors
    /// Fails when the digest is unknown, no path template is configured, or
    /// the fetch fails.
    pub(crate) fn ensure_sequence_loaded(&mut self, digest: &DigestKey) -> Result<()> {
        let record = self
            .sequence_store
            .get(digest)
            .ok_or_else(|| anyhow!("Sequence not found in store"))?;

        if matches!(record, SequenceRecord::Full { .. }) {
            return Ok(());
        }

        let digest_str = &record.metadata().sha512t24u;
        let template = self
            .seqdata_path_template
            .as_ref()
            .ok_or_else(|| anyhow!("No sequence data path template configured"))?;

        let relative_path = Self::expand_template(digest_str, template)
            .to_string_lossy()
            .into_owned();

        if !self.quiet {
            let cached = self
                .local_path
                .as_ref()
                .map(|p| p.join(&relative_path).exists())
                .unwrap_or(false);
            let verb = if cached { "Loading" } else { "Downloading" };
            eprintln!("{} sequence {}...", verb, digest_str);
        }
        let data = Self::fetch_file(
            &self.local_path,
            &self.remote_source,
            &relative_path,
            self.persist_to_disk,
            false,
        )?;

        // Upgrade the stub in place with the fetched bytes.
        self.sequence_store.entry(*digest).and_modify(|r| {
            r.load_data(data);
        });

        Ok(())
    }
1053
1054 pub fn write(&self) -> Result<()> {
1060 if !self.persist_to_disk {
1061 return Err(anyhow!(
1062 "write() only works with disk-backed stores - use write_store_to_dir() instead"
1063 ));
1064 }
1065 self.write_index_files()
1066 }
1067
    /// Export the entire store (sequence data, collection files, index
    /// files, aliases, FHR sidecars, and the `rgstore.json` manifest)
    /// under `root_path`.
    ///
    /// `seqdata_path_template` overrides the store's configured template;
    /// when both are absent the default template is used. Stub sequence
    /// records are skipped since their data is not in memory.
    ///
    /// # Errors
    /// Propagates any filesystem or serialization error.
    pub fn write_store_to_dir<P: AsRef<Path>>(
        &self,
        root_path: P,
        seqdata_path_template: Option<&str>,
    ) -> Result<()> {
        let root_path = root_path.as_ref();

        // Explicit argument > store setting > default.
        let template = seqdata_path_template
            .or(self.seqdata_path_template.as_deref())
            .unwrap_or(DEFAULT_SEQDATA_PATH_TEMPLATE);

        if !self.quiet {
            eprintln!(
                "Writing store to directory: {}; Using seqdata path template: {}",
                root_path.display(),
                template
            );
        }

        fs::create_dir_all(root_path)?;

        let sequences_dir = root_path.join("sequences");
        fs::create_dir_all(&sequences_dir)?;

        let collections_dir = root_path.join("collections");
        fs::create_dir_all(&collections_dir)?;

        // Sequence data files, one per loaded (Full) record.
        for record in self.sequence_store.values() {
            match record {
                SequenceRecord::Full { metadata, .. } => {
                    let rel_path = Self::expand_template(&metadata.sha512t24u, template);
                    let full_path = root_path.join(&rel_path);
                    record.to_file(full_path)?;
                }
                SequenceRecord::Stub(_) => {
                    continue;
                }
            }
        }

        // One .rgsi file per collection.
        for record in self.collections.values() {
            let collection_file_path =
                root_path.join(format!("collections/{}.rgsi", record.metadata().digest));
            record.write_collection_rgsi(&collection_file_path)?;
        }

        let sequence_index_path = root_path.join("sequences.rgsi");
        self.write_sequences_rgsi(&sequence_index_path)?;

        let collection_index_path = root_path.join("collections.rgci");
        self.write_collections_rgci(&collection_index_path)?;

        let aliases_dir = root_path.join("aliases");
        self.aliases.write_to_dir(&aliases_dir)?;

        super::fhr_metadata::write_sidecars(&root_path.join("fhr"), &self.fhr_metadata)?;

        // The manifest is written last, once everything else succeeded.
        self.write_rgstore_json(root_path, template)?;

        Ok(())
    }
1130
1131 pub fn stats(&self) -> StoreStats {
1133 let n_sequences = self.sequence_store.len();
1134 let n_sequences_loaded = self
1135 .sequence_store
1136 .values()
1137 .filter(|record| record.is_loaded())
1138 .count();
1139 let n_collections = self.collections.len();
1140 let n_collections_loaded = self
1141 .collections
1142 .values()
1143 .filter(|record| record.has_sequences())
1144 .count();
1145 let mode_str = match self.mode {
1146 StorageMode::Raw => "Raw",
1147 StorageMode::Encoded => "Encoded",
1148 };
1149 StoreStats {
1150 n_sequences,
1151 n_sequences_loaded,
1152 n_collections,
1153 n_collections_loaded,
1154 storage_mode: mode_str.to_string(),
1155 }
1156 }
1157
1158 pub fn available_alias_namespaces(&self) -> AvailableAliases<'_> {
1160 AvailableAliases {
1161 sequences: &self.available_sequence_alias_namespaces,
1162 collections: &self.available_collection_alias_namespaces,
1163 }
1164 }
1165}
1166
impl Display for ReadonlyRefgetStore {
    /// Human-readable summary: mode and total size, a preview of up to 10
    /// sequences (first 8 characters each), then every loaded collection
    /// with up to 5 named sequences.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let total_size = self.total_disk_size();
        let size_str = format_bytes(total_size);
        writeln!(f, "ReadonlyRefgetStore object:")?;
        writeln!(f, " Mode: {:?}", self.mode)?;
        writeln!(f, " Disk size: {} ({} bytes)", size_str, total_size)?;
        writeln!(f, ">Sequences (n={}):", self.sequence_store.len())?;
        for (i, (sha512_digest, sequence_record)) in self.sequence_store.iter().take(10).enumerate()
        {
            let metadata = sequence_record.metadata();
            // Preview of the first up-to-8 characters, or "<stub>" when the
            // sequence data is not loaded.
            let first_8_chars = match sequence_record {
                SequenceRecord::Stub(_) => "<stub>".to_string(),
                SequenceRecord::Full {
                    metadata,
                    sequence: seq,
                } => {
                    match self.mode {
                        StorageMode::Encoded => {
                            // Decode just the preview window from the
                            // packed bytes.
                            let alphabet = lookup_alphabet(&metadata.alphabet);
                            let decoded = decode_substring_from_bytes(
                                seq,
                                0,
                                8.min(metadata.length),
                                alphabet,
                            );
                            String::from_utf8(decoded).unwrap_or_else(|_| "???".to_string())
                        }
                        StorageMode::Raw => String::from_utf8(seq[0..8.min(seq.len())].to_vec())
                            .unwrap_or_else(|_| "???".to_string()),
                    }
                }
            };

            writeln!(
                f,
                " - {}. {:02x?}, MD5: {:02x?}, Length: {}, Alphabet: {:?}, Start: {}",
                i + 1,
                key_to_digest_string(sha512_digest),
                &metadata.md5,
                &metadata.length,
                &metadata.alphabet,
                first_8_chars
            )?;
        }
        writeln!(f, ">Collections (n={:?}):", self.name_lookup.len())?;
        for (i, (digest, name_map)) in self.name_lookup.iter().enumerate() {
            let seqcol_digest_str = key_to_digest_string(digest);
            writeln!(
                f,
                " {}. Collection Digest: {:02x?} ({} sequences)",
                i + 1,
                seqcol_digest_str,
                name_map.len()
            )?;
            // Cap the per-collection listing at 5 names to keep output
            // short.
            for (name, sha512_digest) in name_map.iter().take(5) {
                let sha512_str = key_to_digest_string(sha512_digest);
                writeln!(f, " - Name: {}, SHA512: {:02x?}", name, sha512_str)?;
            }
            if name_map.len() > 5 {
                writeln!(f, " - ... and {} more", name_map.len() - 5)?;
            }
        }

        Ok(())
    }
}
1234
1235use crate::collection::SequenceCollectionRecordExt;