Skip to main content

gtars_refget/store/
readonly.rs

1//! ReadonlyRefgetStore struct definition and core methods.
2
3use super::*;
4use super::alias::AliasManager;
5
6use std::collections::HashMap;
7use std::fmt::{Display, Formatter};
8use std::path::{Path, PathBuf};
9
10use indexmap::IndexMap;
11
12use anyhow::{anyhow, Context, Result};
13
14use crate::collection::{read_rgsi_file, SequenceMetadataExt, SequenceRecordExt};
15use crate::digest::lookup_alphabet;
16use crate::digest::{
17    SequenceCollectionMetadata, SequenceCollectionRecord, SequenceMetadata,
18    SequenceRecord,
19};
20use crate::digest::{decode_string_from_bytes, decode_substring_from_bytes, encode_sequence};
21use crate::hashkeyable::{DigestKey, HashKeyable, key_to_digest_string};
22use crate::seqcol::metadata_matches_attribute;
23
24use std::fs::{self, create_dir_all};
25
/// Core refget store with `&self` read methods, suitable for `Arc` sharing in servers.
///
/// Mutating methods are used during the setup/loading phase; once wrapped in `Arc`,
/// only `&self` reads are accessible, making concurrent access thread-safe.
///
/// Holds a global `sequence_store` with all sequences (across collections) deduplicated.
/// This allows lookup by sequence digest directly (bypassing collection information).
/// Also holds a `collections` map and a per-collection `name_lookup`, to provide
/// lookup by collection + sequence name.
#[derive(Debug)]
pub struct ReadonlyRefgetStore {
    /// SHA512t24u digest -> SequenceRecord (metadata + optional data)
    pub(crate) sequence_store: HashMap<DigestKey, SequenceRecord>,
    /// MD5 digest -> SHA512t24u digest lookup
    pub(crate) md5_lookup: HashMap<DigestKey, DigestKey>,

    /// Collection digest -> {name -> SHA512t24u digest} (IndexMap preserves FASTA insertion order)
    pub(crate) name_lookup: HashMap<DigestKey, IndexMap<String, DigestKey>>,
    /// Active sequence collections (now using SequenceCollectionRecord for Stub/Full pattern)
    pub(crate) collections: HashMap<DigestKey, SequenceCollectionRecord>,
    /// Storage strategy for sequences
    pub(crate) mode: StorageMode,
    /// Where the store lives on disk (local store or cache directory)
    pub(crate) local_path: Option<PathBuf>,
    /// Where to pull sequences from (if remote-backed)
    pub(crate) remote_source: Option<String>,
    /// Template for sequence file paths (e.g., "sequences/%s2/%s.seq")
    pub(crate) seqdata_path_template: Option<String>,
    /// Whether to persist sequences to disk (write-through caching)
    pub(crate) persist_to_disk: bool,
    /// Whether to suppress progress output
    pub(crate) quiet: bool,
    /// Whether to compute ancillary digests (nlp, snlp, sorted_sequences).
    /// Default: true for new stores.
    pub(crate) ancillary_digests: bool,
    /// Whether on-disk attribute reverse index is enabled.
    /// Default: false. Part 2 implements the indexed path.
    pub(crate) attribute_index: bool,
    /// Human-readable aliases for sequences and collections.
    pub(crate) aliases: AliasManager,
    /// FHR metadata for collections, keyed by collection digest.
    pub(crate) fhr_metadata: HashMap<DigestKey, super::fhr_metadata::FhrMetadata>,
    /// Available sequence alias namespaces (from manifest, for remote discovery).
    pub(crate) available_sequence_alias_namespaces: Vec<String>,
    /// Available collection alias namespaces (from manifest, for remote discovery).
    pub(crate) available_collection_alias_namespaces: Vec<String>,
    /// Cache of decoded sequence bytes, keyed by SHA512t24u digest.
    /// Populated by `ensure_decoded()`, read by `sequence_bytes()`, and emptied by
    /// `clear_decoded_cache()` / `clear()`.
    pub(crate) decoded_cache: HashMap<DigestKey, Vec<u8>>,
}
75
76impl ReadonlyRefgetStore {
77    /// Generic constructor. Creates a new, empty `ReadonlyRefgetStore`.
78    /// Internal only - users should go through RefgetStore.
79    pub(crate) fn new(mode: StorageMode) -> Self {
80        ReadonlyRefgetStore {
81            sequence_store: HashMap::new(),
82            md5_lookup: HashMap::new(),
83            name_lookup: HashMap::new(),
84            collections: HashMap::new(),
85            mode,
86            local_path: None,
87            remote_source: None,
88            seqdata_path_template: None,
89            persist_to_disk: false,
90            quiet: false,
91            ancillary_digests: true,
92            attribute_index: false,
93            aliases: AliasManager::default(),
94            fhr_metadata: HashMap::new(),
95            decoded_cache: HashMap::new(),
96            available_sequence_alias_namespaces: Vec::new(),
97            available_collection_alias_namespaces: Vec::new(),
98        }
99    }
100
101    /// Set whether to suppress progress output.
102    pub fn set_quiet(&mut self, quiet: bool) {
103        self.quiet = quiet;
104    }
105
106    /// Returns whether the store is in quiet mode.
107    pub fn is_quiet(&self) -> bool {
108        self.quiet
109    }
110
111    /// Check whether a valid RefgetStore exists at the given path.
112    pub fn store_exists<P: AsRef<Path>>(path: P) -> bool {
113        path.as_ref().join("rgstore.json").exists()
114    }
115
116    /// Change the storage mode, re-encoding/decoding existing sequences as needed.
117    pub fn set_encoding_mode(&mut self, new_mode: StorageMode) {
118        if self.mode == new_mode {
119            return;
120        }
121
122        for record in self.sequence_store.values_mut() {
123            match record {
124                SequenceRecord::Full { metadata, sequence } => {
125                    match (self.mode, new_mode) {
126                        (StorageMode::Raw, StorageMode::Encoded) => {
127                            let alphabet = lookup_alphabet(&metadata.alphabet);
128                            *sequence = encode_sequence(&*sequence, alphabet);
129                        }
130                        (StorageMode::Encoded, StorageMode::Raw) => {
131                            let alphabet = lookup_alphabet(&metadata.alphabet);
132                            *sequence =
133                                decode_string_from_bytes(&*sequence, metadata.length, alphabet);
134                        }
135                        _ => {}
136                    }
137                }
138                SequenceRecord::Stub(_) => {}
139            }
140        }
141
142        self.mode = new_mode;
143    }
144
145    /// Enable 2-bit encoding for space efficiency.
146    pub fn enable_encoding(&mut self) {
147        self.set_encoding_mode(StorageMode::Encoded);
148    }
149
150    /// Disable encoding, use raw byte storage.
151    pub fn disable_encoding(&mut self) {
152        self.set_encoding_mode(StorageMode::Raw);
153    }
154
155    /// Enable disk persistence for this store.
156    pub fn enable_persistence<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
157        let path = path.as_ref();
158
159        self.local_path = Some(path.to_path_buf());
160        self.persist_to_disk = true;
161        self.seqdata_path_template
162            .get_or_insert_with(|| DEFAULT_SEQDATA_PATH_TEMPLATE.to_string());
163
164        create_dir_all(path.join("sequences"))?;
165        create_dir_all(path.join("collections"))?;
166
167        let keys: Vec<DigestKey> = self.sequence_store.keys().cloned().collect();
168        for key in keys {
169            if let Some(SequenceRecord::Full { metadata, sequence }) = self.sequence_store.get(&key)
170            {
171                self.write_sequence_to_disk_single(metadata, sequence)?;
172                let stub = SequenceRecord::Stub(metadata.clone());
173                self.sequence_store.insert(key, stub);
174            }
175        }
176
177        for record in self.collections.values() {
178            self.write_collection_to_disk_single(record)?;
179        }
180
181        self.write_index_files()?;
182
183        Ok(())
184    }
185
186    /// Disable disk persistence for this store.
187    pub fn disable_persistence(&mut self) {
188        self.persist_to_disk = false;
189    }
190
191    /// Check if persistence to disk is enabled.
192    pub fn is_persisting(&self) -> bool {
193        self.persist_to_disk
194    }
195
196    /// Adds a sequence to the Store
197    pub fn add_sequence<T: Into<Option<DigestKey>>>(
198        &mut self,
199        sequence_record: SequenceRecord,
200        collection_digest: T,
201        force: bool,
202    ) -> Result<()> {
203        let collection_digest = collection_digest
204            .into()
205            .ok_or_else(|| anyhow::anyhow!("Collection digest is required"))?;
206        self.collections.get(&collection_digest).ok_or_else(|| {
207            anyhow::anyhow!("Collection not found for digest: {:?}", collection_digest)
208        })?;
209
210        let metadata = sequence_record.metadata();
211
212        self.name_lookup
213            .entry(collection_digest)
214            .or_default()
215            .insert(metadata.name.clone(), metadata.sha512t24u.to_key());
216
217        self.add_sequence_record(sequence_record, force)?;
218
219        Ok(())
220    }
221
222    /// Adds a collection, and all sequences in it, to the store.
223    pub fn add_sequence_collection(
224        &mut self,
225        collection: crate::digest::SequenceCollection,
226    ) -> Result<()> {
227        self.add_sequence_collection_internal(collection, false)
228    }
229
230    /// Adds a collection, overwriting existing data.
231    pub fn add_sequence_collection_force(
232        &mut self,
233        collection: crate::digest::SequenceCollection,
234    ) -> Result<()> {
235        self.add_sequence_collection_internal(collection, true)
236    }
237
238    /// Internal implementation for adding a sequence collection.
239    pub(crate) fn add_sequence_collection_internal(
240        &mut self,
241        collection: crate::digest::SequenceCollection,
242        force: bool,
243    ) -> Result<()> {
244        let coll_digest = collection.metadata.digest.to_key();
245
246        if !force && self.collections.contains_key(&coll_digest) {
247            return Ok(());
248        }
249
250        let crate::digest::SequenceCollection { metadata, sequences } = collection;
251
252        let record = SequenceCollectionRecord::Full {
253            metadata: metadata.clone(),
254            sequences: sequences.iter().map(|s| SequenceRecord::Stub(s.metadata().clone())).collect(),
255        };
256
257        if self.persist_to_disk && self.local_path.is_some() {
258            self.write_collection_to_disk_single(&record)?;
259        }
260
261        self.collections.insert(coll_digest, record);
262
263        for sequence_record in sequences {
264            self.add_sequence(sequence_record, coll_digest, force)?;
265        }
266
267        if self.persist_to_disk && self.local_path.is_some() {
268            self.write_index_files()?;
269        }
270
271        Ok(())
272    }
273
274    /// Adds a SequenceRecord directly to the store without collection association.
275    pub fn add_sequence_record(&mut self, sr: SequenceRecord, force: bool) -> Result<()> {
276        let metadata = sr.metadata();
277        let key = metadata.sha512t24u.to_key();
278
279        if !force && self.sequence_store.contains_key(&key) {
280            return Ok(());
281        }
282
283        self.md5_lookup
284            .insert(metadata.md5.to_key(), metadata.sha512t24u.to_key());
285
286        if self.persist_to_disk && self.local_path.is_some() {
287            match &sr {
288                SequenceRecord::Full { metadata, sequence } => {
289                    self.write_sequence_to_disk_single(metadata, sequence)?;
290                    let stub = SequenceRecord::Stub(metadata.clone());
291                    self.sequence_store.insert(key, stub);
292                    return Ok(());
293                }
294                SequenceRecord::Stub(_) => {}
295            }
296        }
297
298        self.sequence_store.insert(key, sr);
299        Ok(())
300    }
301
302    // =========================================================================
303    // Sequence query methods
304    // =========================================================================
305
306    /// Returns an iterator over all sequence digests in the store
307    pub fn sequence_digests(&self) -> impl Iterator<Item = DigestKey> + '_ {
308        self.sequence_store.keys().cloned()
309    }
310
311    /// Returns an iterator over sequence metadata for all sequences in the store.
312    pub fn sequence_metadata(&self) -> impl Iterator<Item = &SequenceMetadata> + '_ {
313        self.sequence_store.values().map(|rec| rec.metadata())
314    }
315
316    /// Calculate the total disk size of all sequences in the store
317    pub fn total_disk_size(&self) -> usize {
318        self.sequence_store
319            .values()
320            .map(|rec| rec.metadata().disk_size(&self.mode))
321            .sum()
322    }
323
324    /// Returns the actual disk usage of the store directory.
325    pub fn actual_disk_usage(&self) -> usize {
326        let Some(path) = &self.local_path else {
327            return 0;
328        };
329
330        fn dir_size(path: &std::path::Path) -> usize {
331            let mut total = 0;
332            if let Ok(entries) = std::fs::read_dir(path) {
333                for entry in entries.flatten() {
334                    let path = entry.path();
335                    if path.is_file() {
336                        total += entry.metadata().map(|m| m.len() as usize).unwrap_or(0);
337                    } else if path.is_dir() {
338                        total += dir_size(&path);
339                    }
340                }
341            }
342            total
343        }
344
345        dir_size(path)
346    }
347
348    // =========================================================================
349    // Collection API
350    // =========================================================================
351
352    /// List collections with pagination and optional attribute filtering.
353    pub fn list_collections(
354        &self,
355        page: usize,
356        page_size: usize,
357        filters: &[(&str, &str)],
358    ) -> Result<PagedResult<SequenceCollectionMetadata>> {
359        let mut filtered: Vec<SequenceCollectionMetadata> = Vec::new();
360        for record in self.collections.values() {
361            let meta = record.metadata();
362            let mut passes = true;
363            for &(attr_name, attr_digest) in filters {
364                if !metadata_matches_attribute(meta, attr_name, attr_digest)? {
365                    passes = false;
366                    break;
367                }
368            }
369            if passes {
370                filtered.push(meta.clone());
371            }
372        }
373
374        filtered.sort_by(|a, b| a.digest.cmp(&b.digest));
375
376        let total = filtered.len();
377        let start = page * page_size;
378        let results = if start < total {
379            filtered.into_iter().skip(start).take(page_size).collect()
380        } else {
381            Vec::new()
382        };
383
384        Ok(PagedResult {
385            results,
386            pagination: Pagination {
387                page,
388                page_size,
389                total,
390            },
391        })
392    }
393
394    /// Get metadata for a single collection by digest (no sequence data).
395    pub fn get_collection_metadata<K: AsRef<[u8]>>(
396        &self,
397        collection_digest: K,
398    ) -> Option<&SequenceCollectionMetadata> {
399        let key = collection_digest.to_key();
400        self.collections.get(&key).map(|record| record.metadata())
401    }
402
403    /// Get a collection with all its sequences loaded.
404    pub fn get_collection(&self, collection_digest: &str) -> Result<crate::digest::SequenceCollection> {
405        let key = collection_digest.to_key();
406
407        if !self.name_lookup.contains_key(&key) {
408            return Err(anyhow!(
409                "Collection not loaded: {}. Call load_collection() or load_all_collections() first.",
410                collection_digest
411            ));
412        }
413
414        let metadata = self
415            .collections
416            .get(&key)
417            .ok_or_else(|| anyhow!("Collection not found: {}", collection_digest))?
418            .metadata()
419            .clone();
420
421        // Iterate name_lookup for (name, digest) pairs so each record gets the
422        // correct per-collection name, not the last-written global name.
423        let sequences: Vec<SequenceRecord> = self
424            .name_lookup
425            .get(&key)
426            .map(|name_map| {
427                name_map
428                    .iter()
429                    .filter_map(|(name, seq_key)| {
430                        let record = self.sequence_store.get(seq_key)?;
431                        let mut meta = record.metadata().clone();
432                        meta.name = name.clone();
433                        Some(match record.sequence() {
434                            Some(seq) => SequenceRecord::Full {
435                                metadata: meta,
436                                sequence: seq.to_vec(),
437                            },
438                            None => SequenceRecord::Stub(meta),
439                        })
440                    })
441                    .collect()
442            })
443            .unwrap_or_default();
444
445        Ok(crate::digest::SequenceCollection {
446            metadata,
447            sequences,
448        })
449    }
450
451    /// Remove a collection from the store.
452    pub fn remove_collection(
453        &mut self,
454        digest: &str,
455        remove_orphan_sequences: bool,
456    ) -> Result<bool> {
457        let key = digest.to_key();
458
459        if self.collections.remove(&key).is_none() {
460            return Ok(false);
461        }
462
463        let orphan_candidates: Vec<DigestKey> = self
464            .name_lookup
465            .get(&key)
466            .map(|name_map| name_map.values().cloned().collect())
467            .unwrap_or_default();
468
469        self.name_lookup.remove(&key);
470        self.fhr_metadata.remove(&key);
471
472        // Remove collection aliases pointing to this digest
473        let alias_pairs = self.aliases.reverse_lookup_collection(digest);
474        let affected_namespaces: std::collections::HashSet<String> = alias_pairs
475            .iter()
476            .map(|(ns, _)| ns.clone())
477            .collect();
478        for (ns, alias) in &alias_pairs {
479            self.aliases.remove_collection(ns, alias);
480        }
481        for ns in &affected_namespaces {
482            self.persist_alias_namespace(AliasKind::Collection, ns)?;
483        }
484
485        if remove_orphan_sequences && !orphan_candidates.is_empty() {
486            let mut still_referenced: std::collections::HashSet<DigestKey> =
487                std::collections::HashSet::new();
488            for name_map in self.name_lookup.values() {
489                for seq_key in name_map.values() {
490                    still_referenced.insert(*seq_key);
491                }
492            }
493
494            let orphans: Vec<DigestKey> = orphan_candidates
495                .into_iter()
496                .filter(|k| !still_referenced.contains(k))
497                .collect();
498
499            for orphan_key in &orphans {
500                self.sequence_store.remove(orphan_key);
501                self.md5_lookup.retain(|_, v| v != orphan_key);
502                self.decoded_cache.remove(orphan_key);
503            }
504
505            if self.persist_to_disk {
506                if let (Some(local_path), Some(template)) =
507                    (&self.local_path, &self.seqdata_path_template)
508                {
509                    for orphan_key in &orphans {
510                        let orphan_digest = key_to_digest_string(orphan_key);
511                        let seq_file_path = Self::expand_template(&orphan_digest, template);
512                        let full_path = local_path.join(&seq_file_path);
513                        let _ = fs::remove_file(&full_path);
514                        if let Some(parent) = full_path.parent() {
515                            let _ = fs::remove_dir(parent);
516                        }
517                    }
518                }
519            }
520        }
521
522        if self.persist_to_disk {
523            if let Some(local_path) = &self.local_path {
524                let rgsi_path = local_path.join(format!("collections/{}.rgsi", digest));
525                let _ = fs::remove_file(&rgsi_path);
526                let fhr_path = local_path.join(format!("fhr/{}.fhr.json", digest));
527                let _ = fs::remove_file(&fhr_path);
528            }
529            self.write_index_files()?;
530        }
531
532        Ok(true)
533    }
534
535    // =========================================================================
536    // Import from another store
537    // =========================================================================
538
539    /// Import a single collection (with all its sequences, aliases, and FHR
540    /// metadata) from another store into this store.
541    ///
542    /// The source store must have the collection loaded (call
543    /// `load_collection()` or `load_all_collections()` first).
544    pub fn import_collection(&mut self, source: &ReadonlyRefgetStore, digest: &str) -> Result<()> {
545        let collection = source.get_collection(digest)?;
546        self.add_sequence_collection(collection)?;
547
548        // Copy sequence aliases for every sequence in the imported collection
549        let coll_key = digest.to_key();
550        if let Some(name_map) = source.name_lookup.get(&coll_key) {
551            for seq_key in name_map.values() {
552                let seq_digest = key_to_digest_string(seq_key);
553                for (ns, alias) in source.aliases.reverse_lookup_sequence(&seq_digest) {
554                    self.add_sequence_alias(&ns, &alias, &seq_digest)?;
555                }
556            }
557        }
558
559        // Copy collection aliases
560        for (ns, alias) in source.aliases.reverse_lookup_collection(digest) {
561            self.add_collection_alias(&ns, &alias, digest)?;
562        }
563
564        // Copy FHR metadata
565        if let Some(fhr) = source.get_fhr_metadata(digest) {
566            self.set_fhr_metadata(digest, fhr.clone())?;
567        }
568
569        Ok(())
570    }
571
572    // =========================================================================
573    // Sequence API
574    // =========================================================================
575
576    /// List all sequences in the store (metadata only, no sequence data).
577    pub fn list_sequences(&self) -> Vec<SequenceMetadata> {
578        let mut result: Vec<_> = self
579            .sequence_store
580            .values()
581            .map(|rec| rec.metadata().clone())
582            .collect();
583        result.sort_by(|a, b| a.sha512t24u.cmp(&b.sha512t24u));
584        result
585    }
586
587    /// Get metadata for a single sequence by digest (no sequence data).
588    pub fn get_sequence_metadata<K: AsRef<[u8]>>(
589        &self,
590        seq_digest: K,
591    ) -> Option<&SequenceMetadata> {
592        let key = seq_digest.to_key();
593        self.sequence_store.get(&key).map(|rec| rec.metadata())
594    }
595
596    /// Get a sequence by its SHA512t24u digest.
597    pub fn get_sequence<K: AsRef<[u8]>>(&self, seq_digest: K) -> Result<&SequenceRecord> {
598        let digest_key = seq_digest.to_key();
599        let actual_key = self
600            .md5_lookup
601            .get(&digest_key)
602            .copied()
603            .unwrap_or(digest_key);
604        self.sequence_store.get(&actual_key).ok_or_else(|| {
605            anyhow!(
606                "Sequence not found: {}",
607                String::from_utf8_lossy(seq_digest.as_ref())
608            )
609        })
610    }
611
612    /// Ensure a sequence is loaded and decoded into the decoded cache.
613    pub fn ensure_decoded<K: AsRef<[u8]>>(&mut self, seq_digest: K) -> Result<()> {
614        let digest_key = seq_digest.to_key();
615        let actual_key = self
616            .md5_lookup
617            .get(&digest_key)
618            .copied()
619            .unwrap_or(digest_key);
620
621        if self.decoded_cache.contains_key(&actual_key) {
622            return Ok(());
623        }
624
625        let record = self
626            .sequence_store
627            .get(&actual_key)
628            .ok_or_else(|| anyhow!("Sequence not found"))?;
629        let decoded = record
630            .decode()
631            .ok_or_else(|| anyhow!("Sequence not loaded (stub). Call load_sequence() first."))?;
632
633        self.decoded_cache.insert(actual_key, decoded.into_bytes());
634        Ok(())
635    }
636
637    /// Clear the decoded sequence cache to reclaim memory.
638    pub fn clear_decoded_cache(&mut self) {
639        self.decoded_cache.clear();
640    }
641
642    /// Clear sequence data from the store to free memory.
643    pub fn clear(&mut self) {
644        self.sequence_store.clear();
645        self.decoded_cache.clear();
646    }
647
648    /// Get decoded sequence bytes from the cache.
649    pub fn sequence_bytes<K: AsRef<[u8]>>(&self, seq_digest: K) -> Option<&[u8]> {
650        let digest_key = seq_digest.to_key();
651        let actual_key = self
652            .md5_lookup
653            .get(&digest_key)
654            .copied()
655            .unwrap_or(digest_key);
656        self.decoded_cache.get(&actual_key).map(|v| v.as_slice())
657    }
658
659    /// Get a sequence by collection digest and name.
660    pub fn get_sequence_by_name<K: AsRef<[u8]>>(
661        &self,
662        collection_digest: K,
663        sequence_name: &str,
664    ) -> Result<&SequenceRecord> {
665        let collection_key = collection_digest.to_key();
666
667        if !self.name_lookup.contains_key(&collection_key) {
668            return Err(anyhow!(
669                "Collection not loaded. Call load_collection() or load_all_collections() first."
670            ));
671        }
672
673        let digest_key = self.name_lookup.get(&collection_key)
674            .and_then(|name_map| name_map.get(sequence_name).cloned())
675            .ok_or_else(|| anyhow!("Sequence '{}' not found in collection", sequence_name))?;
676
677        let record = self.sequence_store.get(&digest_key).ok_or_else(|| {
678            anyhow!("Sequence record not found for '{}'. Call load_sequence() first.", sequence_name)
679        })?;
680
681        Ok(record)
682    }
683
684    // =========================================================================
685    // Loading methods
686    // =========================================================================
687
688    /// Eagerly load all Stub collections to Full.
689    pub fn load_all_collections(&mut self) -> Result<()> {
690        let keys: Vec<DigestKey> = self.collections.keys().cloned().collect();
691        for key in keys {
692            self.ensure_collection_loaded(&key)?;
693        }
694        Ok(())
695    }
696
697    /// Eagerly load all Stub sequences to Full.
698    pub fn load_all_sequences(&mut self) -> Result<()> {
699        let keys: Vec<DigestKey> = self.sequence_store.keys().cloned().collect();
700        for key in keys {
701            self.ensure_sequence_loaded(&key)?;
702        }
703        Ok(())
704    }
705
706    /// Load a single collection by digest.
707    pub fn load_collection(&mut self, digest: &str) -> Result<()> {
708        let key = digest.to_key();
709        self.ensure_collection_loaded(&key)
710    }
711
712    /// Load a single sequence by digest.
713    pub fn load_sequence(&mut self, digest: &str) -> Result<()> {
714        let key = digest.to_key();
715        self.ensure_sequence_loaded(&key)
716    }
717
718    /// Iterate over all collections with their sequences loaded.
719    pub fn iter_collections(&self) -> impl Iterator<Item = crate::digest::SequenceCollection> + '_ {
720        let mut digests: Vec<String> = self
721            .collections
722            .values()
723            .map(|rec| rec.metadata().digest.clone())
724            .collect();
725        digests.sort();
726
727        digests.into_iter().filter_map(move |digest| {
728            self.get_collection(&digest).ok()
729        })
730    }
731
732    /// Iterate over all sequences with their data loaded.
733    pub fn iter_sequences(&self) -> impl Iterator<Item = SequenceRecord> + '_ {
734        let mut records: Vec<_> = self.sequence_store.values().cloned().collect();
735        records.sort_by(|a, b| a.metadata().sha512t24u.cmp(&b.metadata().sha512t24u));
736        records.into_iter()
737    }
738
739    /// Check if a collection is fully loaded.
740    pub fn is_collection_loaded<K: AsRef<[u8]>>(&self, collection_digest: K) -> bool {
741        let key = collection_digest.to_key();
742        self.collections
743            .get(&key)
744            .map_or(false, |record| record.has_sequences())
745    }
746
747    /// Returns the local path where the store is located (if any)
748    pub fn local_path(&self) -> Option<&PathBuf> {
749        self.local_path.as_ref()
750    }
751
752    /// Returns the remote source URL (if any)
753    pub fn remote_source(&self) -> Option<&str> {
754        self.remote_source.as_deref()
755    }
756
757    /// Returns the storage mode used by this store
758    pub fn storage_mode(&self) -> StorageMode {
759        self.mode
760    }
761
762    // =========================================================================
763    // Substring retrieval
764    // =========================================================================
765
766    /// Retrieves a substring from an encoded sequence by its SHA512t24u digest.
767    pub fn get_substring<K: AsRef<[u8]>>(
768        &self,
769        sha512_digest: K,
770        start: usize,
771        end: usize,
772    ) -> Result<String> {
773        let digest_key = sha512_digest.to_key();
774
775        let record = self.sequence_store.get(&digest_key).ok_or_else(|| {
776            anyhow!(
777                "Sequence not found: {}",
778                String::from_utf8_lossy(sha512_digest.as_ref())
779            )
780        })?;
781        let (metadata, sequence) = match record {
782            SequenceRecord::Stub(_) => return Err(anyhow!("Sequence data not loaded (stub only)")),
783            SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
784        };
785
786        if start >= metadata.length || end > metadata.length || start >= end {
787            return Err(anyhow!(
788                "Invalid substring range: start={}, end={}, sequence length={}",
789                start,
790                end,
791                metadata.length
792            ));
793        }
794
795        match self.mode {
796            StorageMode::Encoded => {
797                let alphabet = lookup_alphabet(&metadata.alphabet);
798                let decoded_sequence = decode_substring_from_bytes(sequence, start, end, alphabet);
799                String::from_utf8(decoded_sequence)
800                    .map_err(|e| anyhow!("Failed to decode UTF-8 sequence: {}", e))
801            }
802            StorageMode::Raw => {
803                let raw_slice: &[u8] = &sequence[start..end];
804                String::from_utf8(raw_slice.to_vec())
805                    .map_err(|e| anyhow!("Failed to decode UTF-8 sequence: {}", e))
806            }
807        }
808    }
809
810    // =========================================================================
811    // Internal helpers
812    // =========================================================================
813
814    /// Expand a path template by substituting digest-based placeholders.
815    pub(crate) fn expand_template(digest_str: &str, template: &str) -> PathBuf {
816        debug_assert!(
817            digest_str.len() >= 4,
818            "Digest string must be at least 4 characters for template expansion, got {} chars",
819            digest_str.len()
820        );
821        let path_str = template
822            .replace("%s2", digest_str.get(0..2).unwrap_or(digest_str))
823            .replace("%s4", digest_str.get(0..4).unwrap_or(digest_str))
824            .replace("%s", digest_str);
825        PathBuf::from(path_str)
826    }
827
828    /// Validate a relative path to prevent directory traversal attacks.
829    pub(crate) fn sanitize_relative_path(path: &str) -> Result<()> {
830        if path.starts_with('/') || path.starts_with('\\') {
831            return Err(anyhow!("Absolute paths not allowed: {}", path));
832        }
833        if path.contains("..") {
834            return Err(anyhow!("Directory traversal not allowed: {}", path));
835        }
836        if path.contains('\0') {
837            return Err(anyhow!("Null bytes not allowed in path"));
838        }
839        Ok(())
840    }
841
842    /// Helper function to fetch a file from local path or remote source
843    pub(crate) fn fetch_file(
844        local_path: &Option<PathBuf>,
845        remote_source: &Option<String>,
846        relative_path: &str,
847        persist_to_disk: bool,
848        force_refresh: bool,
849    ) -> Result<Vec<u8>> {
850        Self::sanitize_relative_path(relative_path)?;
851
852        if persist_to_disk && !force_refresh {
853            if let Some(local_path) = local_path {
854                let full_local_path = local_path.join(relative_path);
855                if full_local_path.exists() {
856                    return fs::read(&full_local_path).context(format!(
857                        "Failed to read local file: {}",
858                        full_local_path.display()
859                    ));
860                }
861            }
862        }
863
864        if let Some(remote_url) = remote_source {
865            let full_remote_url = if remote_url.ends_with('/') {
866                format!("{}{}", remote_url, relative_path)
867            } else {
868                format!("{}/{}", remote_url, relative_path)
869            };
870
871            let response = ureq::get(&full_remote_url)
872                .call()
873                .map_err(|e| anyhow!("Failed to fetch from remote: {}", e))?;
874
875            let mut data = Vec::new();
876            response
877                .into_reader()
878                .read_to_end(&mut data)
879                .context("Failed to read response body")?;
880
881            if persist_to_disk {
882                if let Some(local_path) = local_path {
883                    let full_local_path = local_path.join(relative_path);
884
885                    if let Some(parent) = full_local_path.parent() {
886                        create_dir_all(parent)?;
887                    }
888
889                    fs::write(&full_local_path, &data).context(format!(
890                        "Failed to cache file to: {}",
891                        full_local_path.display()
892                    ))?;
893                }
894            }
895
896            Ok(data)
897        } else {
898            Err(anyhow!(
899                "File not found locally and no remote source configured: {}",
900                relative_path
901            ))
902        }
903    }
904
905    /// Ensure a collection is loaded into the store
906    pub(crate) fn ensure_collection_loaded(&mut self, collection_digest: &DigestKey) -> Result<()> {
907        if self.name_lookup.contains_key(collection_digest) {
908            return Ok(());
909        }
910
911        let needs_fetch = match self.collections.get(collection_digest) {
912            Some(SequenceCollectionRecord::Stub(_)) => true,
913            Some(SequenceCollectionRecord::Full { .. }) => false,
914            None => true,
915        };
916
917        if needs_fetch {
918            let digest_str = if let Some(SequenceCollectionRecord::Stub(meta)) =
919                self.collections.get(collection_digest)
920            {
921                meta.digest.clone()
922            } else {
923                key_to_digest_string(collection_digest)
924            };
925
926            let relative_path = format!("collections/{}.rgsi", digest_str);
927
928            if !self.quiet {
929                let cached = self
930                    .local_path
931                    .as_ref()
932                    .map(|p| p.join(&relative_path).exists())
933                    .unwrap_or(false);
934                let verb = if cached { "Loading" } else { "Downloading" };
935                eprintln!("{} collection metadata {}...", verb, digest_str);
936            }
937            let _collection_data =
938                Self::fetch_file(&self.local_path, &self.remote_source, &relative_path, true, false)?;
939
940            let local_path = self
941                .local_path
942                .as_ref()
943                .ok_or_else(|| anyhow!("No local path configured"))?;
944
945            let collection_file_path = local_path.join(&relative_path);
946
947            let collection = read_rgsi_file(&collection_file_path)?;
948
949            let loaded_digest = collection.metadata.digest.to_key();
950            if loaded_digest != *collection_digest {
951                return Err(anyhow!(
952                    "Collection digest mismatch: expected {}, got {}",
953                    key_to_digest_string(collection_digest),
954                    key_to_digest_string(&loaded_digest)
955                ));
956            }
957
958            let mut name_map = IndexMap::new();
959            for sequence_record in &collection.sequences {
960                let metadata = sequence_record.metadata();
961                let sha512_key = metadata.sha512t24u.to_key();
962                name_map.insert(metadata.name.clone(), sha512_key);
963
964                if !self.sequence_store.contains_key(&sha512_key) {
965                    self.sequence_store
966                        .insert(sha512_key, SequenceRecord::Stub(metadata.clone()));
967                    let md5_key = metadata.md5.to_key();
968                    self.md5_lookup.insert(md5_key, sha512_key);
969                }
970            }
971            self.name_lookup.insert(*collection_digest, name_map);
972
973            let record = SequenceCollectionRecord::from(collection);
974            self.collections.insert(*collection_digest, record);
975        } else {
976            let sequences_data: Vec<(SequenceMetadata, DigestKey, DigestKey)> =
977                if let Some(SequenceCollectionRecord::Full { sequences, .. }) =
978                    self.collections.get(collection_digest)
979                {
980                    sequences
981                        .iter()
982                        .map(|seq| {
983                            let metadata = seq.metadata().clone();
984                            let sha512_key = metadata.sha512t24u.to_key();
985                            let md5_key = metadata.md5.to_key();
986                            (metadata, sha512_key, md5_key)
987                        })
988                        .collect()
989                } else {
990                    Vec::new()
991                };
992
993            let mut name_map = IndexMap::new();
994            for (metadata, sha512_key, md5_key) in sequences_data {
995                name_map.insert(metadata.name.clone(), sha512_key);
996
997                if !self.sequence_store.contains_key(&sha512_key) {
998                    self.sequence_store
999                        .insert(sha512_key, SequenceRecord::Stub(metadata));
1000                    self.md5_lookup.insert(md5_key, sha512_key);
1001                }
1002            }
1003            self.name_lookup.insert(*collection_digest, name_map);
1004        }
1005
1006        Ok(())
1007    }
1008
1009    /// Ensure a sequence is loaded into memory
1010    pub(crate) fn ensure_sequence_loaded(&mut self, digest: &DigestKey) -> Result<()> {
1011        let record = self
1012            .sequence_store
1013            .get(digest)
1014            .ok_or_else(|| anyhow!("Sequence not found in store"))?;
1015
1016        if matches!(record, SequenceRecord::Full { .. }) {
1017            return Ok(());
1018        }
1019
1020        let digest_str = &record.metadata().sha512t24u;
1021        let template = self
1022            .seqdata_path_template
1023            .as_ref()
1024            .ok_or_else(|| anyhow!("No sequence data path template configured"))?;
1025
1026        let relative_path = Self::expand_template(digest_str, template)
1027            .to_string_lossy()
1028            .into_owned();
1029
1030        if !self.quiet {
1031            let cached = self
1032                .local_path
1033                .as_ref()
1034                .map(|p| p.join(&relative_path).exists())
1035                .unwrap_or(false);
1036            let verb = if cached { "Loading" } else { "Downloading" };
1037            eprintln!("{} sequence {}...", verb, digest_str);
1038        }
1039        let data = Self::fetch_file(
1040            &self.local_path,
1041            &self.remote_source,
1042            &relative_path,
1043            self.persist_to_disk,
1044            false,
1045        )?;
1046
1047        self.sequence_store.entry(*digest).and_modify(|r| {
1048            r.load_data(data);
1049        });
1050
1051        Ok(())
1052    }
1053
1054    // =========================================================================
1055    // Write methods
1056    // =========================================================================
1057
1058    /// Write the store using its configured paths.
1059    pub fn write(&self) -> Result<()> {
1060        if !self.persist_to_disk {
1061            return Err(anyhow!(
1062                "write() only works with disk-backed stores - use write_store_to_dir() instead"
1063            ));
1064        }
1065        self.write_index_files()
1066    }
1067
1068    /// Write a RefgetStore object to a directory
1069    pub fn write_store_to_dir<P: AsRef<Path>>(
1070        &self,
1071        root_path: P,
1072        seqdata_path_template: Option<&str>,
1073    ) -> Result<()> {
1074        let root_path = root_path.as_ref();
1075
1076        let template = seqdata_path_template
1077            .or(self.seqdata_path_template.as_deref())
1078            .unwrap_or(DEFAULT_SEQDATA_PATH_TEMPLATE);
1079
1080        if !self.quiet {
1081            eprintln!(
1082                "Writing store to directory: {}; Using seqdata path template: {}",
1083                root_path.display(),
1084                template
1085            );
1086        }
1087
1088        fs::create_dir_all(root_path)?;
1089
1090        let sequences_dir = root_path.join("sequences");
1091        fs::create_dir_all(&sequences_dir)?;
1092
1093        let collections_dir = root_path.join("collections");
1094        fs::create_dir_all(&collections_dir)?;
1095
1096        for record in self.sequence_store.values() {
1097            match record {
1098                SequenceRecord::Full { metadata, .. } => {
1099                    let rel_path = Self::expand_template(&metadata.sha512t24u, template);
1100                    let full_path = root_path.join(&rel_path);
1101                    record.to_file(full_path)?;
1102                }
1103                SequenceRecord::Stub(_) => {
1104                    continue;
1105                }
1106            }
1107        }
1108
1109        for record in self.collections.values() {
1110            let collection_file_path =
1111                root_path.join(format!("collections/{}.rgsi", record.metadata().digest));
1112            record.write_collection_rgsi(&collection_file_path)?;
1113        }
1114
1115        let sequence_index_path = root_path.join("sequences.rgsi");
1116        self.write_sequences_rgsi(&sequence_index_path)?;
1117
1118        let collection_index_path = root_path.join("collections.rgci");
1119        self.write_collections_rgci(&collection_index_path)?;
1120
1121        let aliases_dir = root_path.join("aliases");
1122        self.aliases.write_to_dir(&aliases_dir)?;
1123
1124        super::fhr_metadata::write_sidecars(&root_path.join("fhr"), &self.fhr_metadata)?;
1125
1126        self.write_rgstore_json(root_path, template)?;
1127
1128        Ok(())
1129    }
1130
1131    /// Returns statistics about the store
1132    pub fn stats(&self) -> StoreStats {
1133        let n_sequences = self.sequence_store.len();
1134        let n_sequences_loaded = self
1135            .sequence_store
1136            .values()
1137            .filter(|record| record.is_loaded())
1138            .count();
1139        let n_collections = self.collections.len();
1140        let n_collections_loaded = self
1141            .collections
1142            .values()
1143            .filter(|record| record.has_sequences())
1144            .count();
1145        let mode_str = match self.mode {
1146            StorageMode::Raw => "Raw",
1147            StorageMode::Encoded => "Encoded",
1148        };
1149        StoreStats {
1150            n_sequences,
1151            n_sequences_loaded,
1152            n_collections,
1153            n_collections_loaded,
1154            storage_mode: mode_str.to_string(),
1155        }
1156    }
1157
1158    /// List alias namespaces available on this store (from manifest).
1159    pub fn available_alias_namespaces(&self) -> AvailableAliases<'_> {
1160        AvailableAliases {
1161            sequences: &self.available_sequence_alias_namespaces,
1162            collections: &self.available_collection_alias_namespaces,
1163        }
1164    }
1165}
1166
1167impl Display for ReadonlyRefgetStore {
1168    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1169        let total_size = self.total_disk_size();
1170        let size_str = format_bytes(total_size);
1171        writeln!(f, "ReadonlyRefgetStore object:")?;
1172        writeln!(f, "  Mode: {:?}", self.mode)?;
1173        writeln!(f, "  Disk size: {} ({} bytes)", size_str, total_size)?;
1174        writeln!(f, ">Sequences (n={}):", self.sequence_store.len())?;
1175        for (i, (sha512_digest, sequence_record)) in self.sequence_store.iter().take(10).enumerate()
1176        {
1177            let metadata = sequence_record.metadata();
1178            let first_8_chars = match sequence_record {
1179                SequenceRecord::Stub(_) => "<stub>".to_string(),
1180                SequenceRecord::Full {
1181                    metadata,
1182                    sequence: seq,
1183                } => {
1184                    match self.mode {
1185                        StorageMode::Encoded => {
1186                            let alphabet = lookup_alphabet(&metadata.alphabet);
1187                            let decoded = decode_substring_from_bytes(
1188                                seq,
1189                                0,
1190                                8.min(metadata.length),
1191                                alphabet,
1192                            );
1193                            String::from_utf8(decoded).unwrap_or_else(|_| "???".to_string())
1194                        }
1195                        StorageMode::Raw => String::from_utf8(seq[0..8.min(seq.len())].to_vec())
1196                            .unwrap_or_else(|_| "???".to_string()),
1197                    }
1198                }
1199            };
1200
1201            writeln!(
1202                f,
1203                "   - {}. {:02x?}, MD5: {:02x?}, Length: {}, Alphabet: {:?}, Start: {}",
1204                i + 1,
1205                key_to_digest_string(sha512_digest),
1206                &metadata.md5,
1207                &metadata.length,
1208                &metadata.alphabet,
1209                first_8_chars
1210            )?;
1211        }
1212        writeln!(f, ">Collections (n={:?}):", self.name_lookup.len())?;
1213        for (i, (digest, name_map)) in self.name_lookup.iter().enumerate() {
1214            let seqcol_digest_str = key_to_digest_string(digest);
1215            writeln!(
1216                f,
1217                "  {}. Collection Digest: {:02x?} ({} sequences)",
1218                i + 1,
1219                seqcol_digest_str,
1220                name_map.len()
1221            )?;
1222            for (name, sha512_digest) in name_map.iter().take(5) {
1223                let sha512_str = key_to_digest_string(sha512_digest);
1224                writeln!(f, "   - Name: {}, SHA512: {:02x?}", name, sha512_str)?;
1225            }
1226            if name_map.len() > 5 {
1227                writeln!(f, "   - ... and {} more", name_map.len() - 5)?;
1228            }
1229        }
1230
1231        Ok(())
1232    }
1233}
1234
1235// Extension traits used by collection.rs
1236use crate::collection::SequenceCollectionRecordExt;