// gtars_refget/store.rs
1//! # RefgetStore
2//!
3//! A store for managing reference genome sequences with support for both
4//! in-memory and disk-backed storage.
5//!
6//! ## Store Creation Patterns
7//!
8//! ### New stores (empty)
9//! - `in_memory()` - All data in RAM, fast but lost on drop
10//! - `on_disk(path)` - Sequences written to disk immediately, only metadata in RAM
11//!
12//! ### Loading existing stores
13//! - `load_local(path)` - Load from local directory (lazy-loads sequences)
14//! - `load_remote(path, url)` - Load from URL, caches to local directory
15//!
16//! ## Runtime Configuration
17//!
18//! ### Persistence control
19//! - `enable_persistence(path)` - Start writing to disk, flush in-memory data
20//! - `disable_persistence()` - Stop writing to disk (can still read)
21//!
22//! ### Encoding control
23//! - `set_encoding_mode(mode)` - Switch between Raw and Encoded storage
24//! - `enable_encoding()` - Use 2-bit encoding (space efficient)
25//! - `disable_encoding()` - Use raw bytes
26
27use crate::digest::{AlphabetType, lookup_alphabet};
28use seq_io::fasta::{Reader, Record};
29use std::collections::HashMap;
30use std::ffi::OsStr;
31use std::fmt::{Display, Formatter};
32use std::path::{Path, PathBuf};
33use std::time::Instant;
34
35use crate::collection::{
36    SequenceCollectionExt, SequenceCollectionRecordExt, SequenceMetadataExt, SequenceRecordExt,
37    read_rgsi_file,
38};
39use crate::digest::{
40    SequenceCollection, SequenceCollectionMetadata, SequenceCollectionRecord, SequenceMetadata,
41    SequenceRecord, parse_rgsi_line,
42};
43use crate::digest::{
44    SequenceEncoder, decode_string_from_bytes, decode_substring_from_bytes, encode_sequence,
45};
46use crate::hashkeyable::HashKeyable;
47use anyhow::anyhow;
48use anyhow::{Context, Result};
49use chrono::Utc;
50use flate2::Compression;
51use flate2::read::GzDecoder;
52use flate2::write::GzEncoder;
53use gtars_core::utils::{get_dynamic_reader, get_file_info, parse_bedlike_file};
54use serde::{Deserialize, Serialize};
55use std::fs::{self, File, create_dir_all};
56use std::io::{BufRead, BufReader, Read, Write};
57use std::str;
58
59// const DEFAULT_COLLECTION_ID: [u8; 32] = [0u8; 32]; // Default collection ID for the name lookup table
60
61const DEFAULT_COLLECTION_ID: &str = "DEFAULT_REFGET_SEQUENCE_COLLECTION"; // Default collection ID for the name lookup table
62const DEFAULT_SEQDATA_PATH_TEMPLATE: &str = "sequences/%s2/%s.seq"; // Default template for sequence file paths
63
64/// Parse a single line from an RGCI (collection index) file.
65///
66/// RGCI format is tab-separated with 5 columns:
67/// digest, n_sequences, names_digest, sequences_digest, lengths_digest
68///
69/// Lines starting with '#' are treated as comments and return None.
70/// Lines with fewer than 5 columns return None.
71fn parse_rgci_line(line: &str) -> Option<SequenceCollectionMetadata> {
72    if line.starts_with('#') {
73        return None;
74    }
75    let parts: Vec<&str> = line.split('\t').collect();
76    if parts.len() < 5 {
77        return None;
78    }
79    Some(SequenceCollectionMetadata {
80        digest: parts[0].to_string(),
81        n_sequences: parts[1].parse().ok()?,
82        names_digest: parts[2].to_string(),
83        sequences_digest: parts[3].to_string(),
84        lengths_digest: parts[4].to_string(),
85        file_path: None,
86    })
87}
88
89/// Enum storing whether sequences will be stored in Raw or Encoded form
90#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)]
91pub enum StorageMode {
92    Raw,
93    Encoded,
94}
95
/// A sequence substring retrieved from the store, together with the
/// region it was requested for.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RetrievedSequence {
    /// The retrieved sequence text.
    pub sequence: String,
    /// Name of the chromosome/sequence the substring came from.
    pub chrom_name: String,
    /// Start coordinate of the region, as parsed from the BED-like input.
    pub start: u32,
    /// End coordinate of the region, as parsed from the BED-like input.
    pub end: u32,
}
103
/// Global store handling cross-collection sequence management.
///
/// Holds a global `sequence_store`, which holds all sequences (across collections)
/// so that sequences are deduplicated.
/// This allows lookup by sequence digest directly (bypassing collection information).
/// The RefgetStore also holds a collections hashmap, to provide lookup by collection+name.
#[derive(Debug)]
pub struct RefgetStore {
    /// SHA512t24u digest -> SequenceRecord (metadata + optional data)
    sequence_store: HashMap<[u8; 32], SequenceRecord>,
    /// MD5 digest -> SHA512t24u digest lookup
    md5_lookup: HashMap<[u8; 32], [u8; 32]>,

    /// Collection digest -> {name -> SHA512t24u digest}
    name_lookup: HashMap<[u8; 32], HashMap<String, [u8; 32]>>,
    /// Active sequence collections (using SequenceCollectionRecord for the Stub/Full pattern)
    collections: HashMap<[u8; 32], SequenceCollectionRecord>,
    /// Storage strategy for sequences (Raw bytes or Encoded)
    mode: StorageMode,
    /// Where the store lives on disk (local store or cache directory), if any
    local_path: Option<PathBuf>,
    /// Where to pull sequences from (if remote-backed)
    remote_source: Option<String>,
    /// Template for sequence file paths (e.g., "sequences/%s2/%s.seq")
    seqdata_path_template: Option<String>,
    /// Whether to persist sequences to disk (write-through caching)
    persist_to_disk: bool,
    /// Whether to suppress progress output
    quiet: bool,
}
133
/// Metadata for the entire store.
/// This is used to serialize metadata to `rgstore.json`, which can be loaded by the application.
#[derive(Serialize, Deserialize, Debug)]
struct StoreMetadata {
    /// Version of the metadata format
    version: u32,
    /// Template for sequence file paths
    seqdata_path_template: String,
    /// Template for collection file paths
    collections_path_template: String,
    /// Path to the sequence metadata index file
    sequence_index: String,
    /// Path to the collection metadata index file.
    /// `serde(default)` keeps older `rgstore.json` files without this key loadable.
    #[serde(default)]
    collection_index: Option<String>,
    /// Storage mode (Raw or Encoded)
    mode: StorageMode,
    /// Creation timestamp
    created_at: String,
}
154
/// Iterator that reads BED-like region lines from a reader and yields the
/// corresponding substrings from a collection in a `RefgetStore`.
///
/// Caches the digest of the most recently resolved chromosome name so that
/// consecutive regions on the same chromosome skip the name lookup.
pub struct SubstringsFromRegions<'a, K>
where
    K: AsRef<[u8]>,
{
    /// Store used to resolve names and fetch substrings.
    store: &'a mut RefgetStore,
    /// Line-oriented reader over the BED-like input.
    reader: BufReader<Box<dyn Read>>,
    /// Digest of the collection to look sequence names up in.
    collection_digest: K,
    /// Chromosome name seen on the previous line (cache key).
    previous_parsed_chr: String,
    /// SHA512t24u digest corresponding to the cached chromosome name.
    current_seq_digest: String,
    /// Number of lines consumed so far.
    line_num: usize,
}
166
167impl<K> Iterator for SubstringsFromRegions<'_, K>
168where
169    K: AsRef<[u8]>,
170{
171    type Item = Result<RetrievedSequence, Box<dyn std::error::Error>>;
172
173    fn next(&mut self) -> Option<Self::Item> {
174        let mut line_string = String::new();
175
176        let num_bytes = self.reader.read_line(&mut line_string);
177        match num_bytes {
178            Ok(bytes) => {
179                if bytes == 0 {
180                    return None;
181                }
182            }
183            Err(err) => return Some(Err(err.into())),
184        };
185
186        self.line_num += 1;
187
188        let (parsed_chr, parsed_start, parsed_end) = match parse_bedlike_file(line_string.trim()) {
189            Some(coords) => coords,
190            None => {
191                let err_str = format!(
192                    "Error reading line {} because it could not be parsed as a BED-like entry: '{}'",
193                    self.line_num + 1,
194                    line_string
195                );
196                return Some(Err(err_str.into()));
197            }
198        };
199
200        if parsed_start == -1 || parsed_end == -1 {
201            let err_str = format!(
202                "Error reading line {} due to invalid start or end coordinates: '{}'",
203                self.line_num + 1,
204                line_string
205            );
206            return Some(Err(err_str.into()));
207        }
208
209        if self.previous_parsed_chr != parsed_chr {
210            self.previous_parsed_chr = parsed_chr.clone();
211
212            let result = match self
213                .store
214                .get_sequence_by_name(&self.collection_digest, &parsed_chr)
215            {
216                Ok(seq_record) => seq_record,
217                Err(e) => {
218                    let err_str = format!(
219                        "Line {}: sequence '{}' not found in collection '{}': {}",
220                        self.line_num + 1,
221                        parsed_chr,
222                        String::from_utf8_lossy(self.collection_digest.as_ref()),
223                        e
224                    );
225                    return Some(Err(err_str.into()));
226                }
227            };
228
229            self.current_seq_digest = result.metadata().sha512t24u.clone();
230        }
231
232        let retrieved_substring = match self.store.get_substring(
233            &self.current_seq_digest,
234            parsed_start as usize,
235            parsed_end as usize,
236        ) {
237            Ok(substring) => substring,
238            Err(e) => {
239                let err_str = format!(
240                    "Line {}: failed to get substring for digest '{}' from {} to {}: {}",
241                    self.line_num + 1,
242                    self.current_seq_digest,
243                    parsed_start,
244                    parsed_end,
245                    e
246                );
247                return Some(Err(err_str.into()));
248            }
249        };
250
251        Some(Ok(RetrievedSequence {
252            sequence: retrieved_substring,
253            chrom_name: parsed_chr,
254            start: parsed_start as u32, // Convert i32 to u32
255            end: parsed_end as u32,     // Convert i32 to u32
256        }))
257    }
258}
259
260impl RefgetStore {
261    /// Generic constructor. Creates a new, empty `RefgetStore`.
262    /// This is a private helper - use `on_disk()` or `in_memory()` instead.
263    fn new(mode: StorageMode) -> Self {
264        // Initialize the name lookup with a default collection
265        let mut name_lookup = HashMap::new();
266        name_lookup.insert(DEFAULT_COLLECTION_ID.to_key(), HashMap::new());
267
268        RefgetStore {
269            sequence_store: HashMap::new(),
270            md5_lookup: HashMap::new(),
271            name_lookup,
272            collections: HashMap::new(),
273            mode,
274            local_path: None,
275            remote_source: None,
276            seqdata_path_template: None,
277            persist_to_disk: false, // on_disk() overrides to true
278            quiet: false,
279        }
280    }
281
    /// Set whether to suppress progress output.
    ///
    /// When quiet is true, operations like add_sequence_collection_from_fasta
    /// will not print progress messages. Stores default to quiet = false.
    ///
    /// # Arguments
    /// * `quiet` - Whether to suppress progress output
    pub fn set_quiet(&mut self, quiet: bool) {
        self.quiet = quiet;
    }
292
    /// Returns whether the store is in quiet mode (progress output suppressed).
    pub fn is_quiet(&self) -> bool {
        self.quiet
    }
297
298    /// Create a disk-backed RefgetStore
299    ///
300    /// Sequences are written to disk immediately and loaded on-demand (lazy loading).
301    /// Only metadata is kept in memory.
302    ///
303    /// # Arguments
304    /// * `cache_path` - Directory for storing sequences and metadata
305    /// * `mode` - Storage mode (Raw or Encoded)
306    ///
307    /// # Returns
308    /// Result with a configured disk-backed store
309    ///
310    /// # Example
311    /// ```ignore
312    /// let store = RefgetStore::on_disk("/data/store")?;
313    /// store.add_sequence_collection_from_fasta("genome.fa")?;
314    /// ```
315    pub fn on_disk<P: AsRef<Path>>(cache_path: P) -> Result<Self> {
316        let cache_path = cache_path.as_ref();
317        let index_path = cache_path.join("rgstore.json");
318
319        if index_path.exists() {
320            // Load existing store
321            Self::open_local(cache_path)
322        } else {
323            // Create new store with default Encoded mode
324            let mode = StorageMode::Encoded;
325            create_dir_all(cache_path)?;
326
327            // Use private new() helper
328            let mut store = Self::new(mode);
329            store.local_path = Some(cache_path.to_path_buf());
330            store.seqdata_path_template = Some(DEFAULT_SEQDATA_PATH_TEMPLATE.to_string());
331            store.persist_to_disk = true; // Always true for on_disk
332
333            // Create directory structure
334            create_dir_all(cache_path.join("sequences"))?;
335            create_dir_all(cache_path.join("collections"))?;
336
337            Ok(store)
338        }
339    }
340
    /// Create an in-memory RefgetStore
    ///
    /// All sequences kept in RAM for fast access; persistence is disabled.
    /// Defaults to Encoded storage mode (2-bit packing for space efficiency).
    /// Use set_encoding_mode() to change storage mode after creation, or
    /// enable_persistence() to start writing to disk.
    ///
    /// # Example
    /// ```ignore
    /// let store = RefgetStore::in_memory();
    /// store.add_sequence_collection_from_fasta("genome.fa")?;
    /// ```
    pub fn in_memory() -> Self {
        Self::new(StorageMode::Encoded)
    }
355
356    /// Change the storage mode, re-encoding/decoding existing sequences as needed.
357    ///
358    /// When switching from Raw to Encoded:
359    /// - All Full sequences in memory are encoded (2-bit packed)
360    ///
361    /// When switching from Encoded to Raw:
362    /// - All Full sequences in memory are decoded back to raw bytes
363    ///
364    /// Note: Stub sequences (lazy-loaded from disk) are not affected.
365    /// They will be loaded in the NEW mode when accessed.
366    ///
367    /// # Arguments
368    /// * `new_mode` - The storage mode to switch to
369    pub fn set_encoding_mode(&mut self, new_mode: StorageMode) {
370        if self.mode == new_mode {
371            return; // No change needed
372        }
373
374        // Re-encode/decode all Full sequences in memory
375        for record in self.sequence_store.values_mut() {
376            match record {
377                SequenceRecord::Full { metadata, sequence } => {
378                    match (self.mode, new_mode) {
379                        (StorageMode::Raw, StorageMode::Encoded) => {
380                            // Encode: raw bytes -> 2-bit packed
381                            let alphabet = lookup_alphabet(&metadata.alphabet);
382                            *sequence = encode_sequence(&*sequence, alphabet);
383                        }
384                        (StorageMode::Encoded, StorageMode::Raw) => {
385                            // Decode: 2-bit packed -> raw bytes
386                            let alphabet = lookup_alphabet(&metadata.alphabet);
387                            *sequence =
388                                decode_string_from_bytes(&*sequence, metadata.length, alphabet);
389                        }
390                        _ => {} // Same mode, no conversion needed
391                    }
392                }
393                SequenceRecord::Stub(_) => {
394                    // Stubs don't hold sequence data, nothing to convert
395                }
396            }
397        }
398
399        self.mode = new_mode;
400    }
401
    /// Enable 2-bit encoding for space efficiency.
    /// Re-encodes any existing Raw sequences in memory.
    /// No-op if the store is already in Encoded mode.
    pub fn enable_encoding(&mut self) {
        self.set_encoding_mode(StorageMode::Encoded);
    }
407
    /// Disable encoding, use raw byte storage.
    /// Decodes any existing Encoded sequences in memory.
    /// No-op if the store is already in Raw mode.
    pub fn disable_encoding(&mut self) {
        self.set_encoding_mode(StorageMode::Raw);
    }
413
    /// Enable disk persistence for this store.
    ///
    /// Sets up the store to write sequences to disk. Any in-memory Full sequences
    /// are flushed to disk and converted to Stubs. An already-configured
    /// seqdata_path_template is preserved; otherwise the default is used.
    ///
    /// # Arguments
    /// * `path` - Directory for storing sequences and metadata
    ///
    /// # Returns
    /// Result indicating success or error
    pub fn enable_persistence<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
        let path = path.as_ref();

        // Set up persistence configuration
        self.local_path = Some(path.to_path_buf());
        self.persist_to_disk = true;
        self.seqdata_path_template
            .get_or_insert_with(|| DEFAULT_SEQDATA_PATH_TEMPLATE.to_string());

        // Create directory structure
        create_dir_all(path.join("sequences"))?;
        create_dir_all(path.join("collections"))?;

        // Flush any in-memory Full sequences to disk.
        // Keys are collected first so the map can be mutated inside the loop.
        let keys: Vec<[u8; 32]> = self.sequence_store.keys().cloned().collect();
        for key in keys {
            if let Some(SequenceRecord::Full { metadata, sequence }) = self.sequence_store.get(&key)
            {
                // Write to disk
                self.write_sequence_to_disk_single(metadata, sequence)?;
                // Convert to stub (metadata only), freeing the in-memory data
                let stub = SequenceRecord::Stub(metadata.clone());
                self.sequence_store.insert(key, stub);
            }
        }

        // Write all collections to disk
        for record in self.collections.values() {
            self.write_collection_to_disk_single(record)?;
        }

        // Write index files so the store is loadable from disk
        self.write_index_files()?;

        Ok(())
    }
460
    /// Disable disk persistence for this store.
    ///
    /// New sequences will be kept in memory only. Existing Stub sequences
    /// can still be loaded from disk if local_path is set.
    pub fn disable_persistence(&mut self) {
        // Only the flag is cleared; local_path is kept so Stubs remain loadable.
        self.persist_to_disk = false;
    }
468
    /// Check if persistence to disk is enabled (write-through mode).
    pub fn is_persisting(&self) -> bool {
        self.persist_to_disk
    }
473
474    /// Adds a sequence to the Store
475    /// Ensure that it is added to the appropriate collection.
476    /// If no collection is specified, it will be added to the default collection.
477    ///
478    /// # Arguments
479    /// * `sequence_record` - The sequence to add
480    /// * `collection_digest` - Collection to add to (or None for default)
481    /// * `force` - If true, overwrite existing sequences. If false, skip duplicates.
482    // Using Into here  instead of the Option direction allows us to accept
483    // either None or [u8; 32], without having to wrap it in Some().
484    pub fn add_sequence<T: Into<Option<[u8; 32]>>>(
485        &mut self,
486        sequence_record: SequenceRecord,
487        collection_digest: T,
488        force: bool,
489    ) -> Result<()> {
490        // Ensure collection exists; otherwise use the default collection
491        let collection_digest = collection_digest
492            .into()
493            .unwrap_or(DEFAULT_COLLECTION_ID.to_key());
494        self.collections.get(&collection_digest).ok_or_else(|| {
495            anyhow::anyhow!("Collection not found for digest: {:?}", collection_digest)
496        })?;
497
498        // Get metadata from the record (works for both Stub and Full variants)
499        let metadata = sequence_record.metadata();
500
501        // Add to name lookup for the collection
502        self.name_lookup
503            .entry(collection_digest)
504            .or_default()
505            .insert(metadata.name.clone(), metadata.sha512t24u.to_key());
506
507        // Finally, add SequenceRecord to store (consuming the object)
508        self.add_sequence_record(sequence_record, force)?;
509
510        Ok(())
511    }
512
    /// Adds a collection, and all sequences in it, to the store.
    ///
    /// Skips collections and sequences that already exist.
    /// Use `add_sequence_collection_force()` to overwrite existing data.
    /// If persistence is enabled, the collection and index files are written to disk.
    ///
    /// # Arguments
    /// * `collection` - The sequence collection to add
    pub fn add_sequence_collection(&mut self, collection: SequenceCollection) -> Result<()> {
        self.add_sequence_collection_internal(collection, false)
    }
523
    /// Adds a collection, and all sequences in it, to the store, overwriting existing data.
    ///
    /// Forces overwrite of collections and sequences that already exist.
    /// Use `add_sequence_collection()` to skip duplicates (safer default).
    /// If persistence is enabled, the collection and index files are written to disk.
    ///
    /// # Arguments
    /// * `collection` - The sequence collection to add
    pub fn add_sequence_collection_force(&mut self, collection: SequenceCollection) -> Result<()> {
        self.add_sequence_collection_internal(collection, true)
    }
534
535    /// Internal implementation for adding a sequence collection.
536    fn add_sequence_collection_internal(
537        &mut self,
538        collection: SequenceCollection,
539        force: bool,
540    ) -> Result<()> {
541        let coll_digest = collection.metadata.digest.to_key();
542
543        // Check if collection already exists
544        if !force && self.collections.contains_key(&coll_digest) {
545            // Skip - collection already exists and force=false
546            return Ok(());
547        }
548
549        // Convert to SequenceCollectionRecord
550        let record = SequenceCollectionRecord::from(collection.clone());
551
552        // Write collection to disk if persist_to_disk is enabled (before moving sequences)
553        if self.persist_to_disk && self.local_path.is_some() {
554            self.write_collection_to_disk_single(&record)?;
555        }
556
557        // Register the collection record
558        self.collections.insert(coll_digest, record);
559
560        // Add all sequences in the collection to the store
561        for sequence_record in collection.sequences {
562            self.add_sequence(sequence_record, coll_digest, force)?;
563        }
564
565        // Write index files so store is immediately loadable
566        if self.persist_to_disk && self.local_path.is_some() {
567            self.write_index_files()?;
568        }
569
570        Ok(())
571    }
572
573    // Adds SequenceRecord to the store.
574    // Should only be used internally, via `add_sequence`, which ensures sequences are added to collections.
575    // If the store is disk-backed (persist_to_disk=true), Full records are written to disk and replaced with Stubs.
576    fn add_sequence_record(&mut self, sr: SequenceRecord, force: bool) -> Result<()> {
577        let metadata = sr.metadata();
578        let key = metadata.sha512t24u.to_key();
579
580        // Check if sequence already exists
581        if !force && self.sequence_store.contains_key(&key) {
582            // Skip - sequence already exists and force=false
583            return Ok(());
584        }
585
586        self.md5_lookup
587            .insert(metadata.md5.to_key(), metadata.sha512t24u.to_key());
588
589        // Check if we should write Full records to disk
590        if self.persist_to_disk && self.local_path.is_some() {
591            match &sr {
592                SequenceRecord::Full { metadata, sequence } => {
593                    // Write to disk
594                    self.write_sequence_to_disk_single(metadata, sequence)?;
595                    // Store as stub instead
596                    let stub = SequenceRecord::Stub(metadata.clone());
597                    self.sequence_store.insert(key, stub);
598                    return Ok(());
599                }
600                SequenceRecord::Stub(_) => {
601                    // Already a stub, just add it normally below
602                }
603            }
604        }
605
606        // Add as-is (either memory-only mode, or already a Stub)
607        self.sequence_store.insert(key, sr);
608        Ok(())
609    }
610
    /// Add a sequence collection from a FASTA file.
    ///
    /// Skips sequences and collections that already exist in the store.
    /// Use `add_sequence_collection_from_fasta_force()` to overwrite existing data.
    ///
    /// # Arguments
    /// * `file_path` - Path to the FASTA file (opened via `get_dynamic_reader`)
    ///
    /// # Returns
    /// A tuple of (SequenceCollectionMetadata, was_new) where was_new indicates
    /// whether the collection was newly added (true) or already existed (false).
    ///
    /// # Notes
    /// Loading sequence data requires 2 passes through the FASTA file:
    /// 1. First pass digests and guesses the alphabet to produce SequenceMetadata
    /// 2. Second pass encodes the sequences based on the detected alphabet
    pub fn add_sequence_collection_from_fasta<P: AsRef<Path>>(
        &mut self,
        file_path: P,
    ) -> Result<(SequenceCollectionMetadata, bool)> {
        self.add_sequence_collection_from_fasta_internal(file_path, false)
    }
633
    /// Add a sequence collection from a FASTA file, overwriting existing data.
    ///
    /// Forces overwrite of collections and sequences that already exist in the store.
    /// Use `add_sequence_collection_from_fasta()` to skip duplicates (safer default).
    ///
    /// # Arguments
    /// * `file_path` - Path to the FASTA file (opened via `get_dynamic_reader`)
    ///
    /// # Returns
    /// A tuple of (SequenceCollectionMetadata, was_new) where was_new is always true
    /// since force mode always overwrites.
    pub fn add_sequence_collection_from_fasta_force<P: AsRef<Path>>(
        &mut self,
        file_path: P,
    ) -> Result<(SequenceCollectionMetadata, bool)> {
        self.add_sequence_collection_from_fasta_internal(file_path, true)
    }
651
652    /// Internal implementation for adding a sequence collection from FASTA.
653    /// Returns (SequenceCollectionMetadata, was_new) where was_new indicates if the collection was added.
654    fn add_sequence_collection_from_fasta_internal<P: AsRef<Path>>(
655        &mut self,
656        file_path: P,
657        force: bool,
658    ) -> Result<(SequenceCollectionMetadata, bool)> {
659        // Print start message
660        if !self.quiet {
661            println!("Processing {}...", file_path.as_ref().display());
662        }
663
664        // Phase 1: Digest computation
665        let digest_start = Instant::now();
666        let seqcol = SequenceCollection::from_fasta(&file_path)?;
667        let digest_elapsed = digest_start.elapsed();
668
669        // Get metadata directly from the collection
670        let metadata = seqcol.metadata.clone();
671
672        // Check if collection already exists and skip if not forcing
673        if !force
674            && self
675                .collections
676                .contains_key(&seqcol.metadata.digest.to_key())
677        {
678            if !self.quiet {
679                println!("Skipped {} (already exists)", seqcol.metadata.digest);
680            }
681            return Ok((metadata, false));
682        }
683
684        // Register the collection
685        self.add_sequence_collection_internal(seqcol.clone(), force)?;
686
687        // Local hashmap to store SequenceMetadata (digests)
688        let mut seqmeta_hashmap: HashMap<String, SequenceMetadata> = HashMap::new();
689        let seqcol_sequences = seqcol.sequences.clone(); // Clone to avoid partial move
690        for record in seqcol_sequences {
691            let seqmeta = record.metadata().clone();
692            seqmeta_hashmap.insert(seqmeta.name.clone(), seqmeta);
693        }
694
695        let file_reader = get_dynamic_reader(file_path.as_ref())?;
696        let mut fasta_reader = Reader::new(file_reader);
697
698        // Phase 2: Load/encode sequences
699        let encode_start = Instant::now();
700
701        let mut seq_count = 0;
702        while let Some(record) = fasta_reader.next() {
703            let record = record?;
704            let header = std::str::from_utf8(record.head())?;
705            // Parse header to get name (first word) - same logic as digest_fasta
706            let (name, _description) = crate::fasta::parse_fasta_header(header);
707            let dr = seqmeta_hashmap
708                .get(&name)
709                .ok_or_else(|| {
710                    let available_keys: Vec<_> = seqmeta_hashmap.keys().collect();
711                    let total = available_keys.len();
712                    let sample: Vec<_> = available_keys.iter().take(3).collect();
713                    anyhow::anyhow!(
714                        "Sequence '{}' not found in metadata. Available ({} total): {:?}{}",
715                        name,
716                        total,
717                        sample,
718                        if total > 3 { " ..." } else { "" }
719                    )
720                })?
721                .clone();
722
723            seq_count += 1;
724
725            match self.mode {
726                StorageMode::Raw => {
727                    let mut raw_sequence = Vec::with_capacity(dr.length);
728                    // For raw, just extend with the line content.
729                    for seq_line in record.seq_lines() {
730                        raw_sequence.extend(seq_line);
731                    }
732
733                    // Always replace Stubs with Full sequences from FASTA
734                    self.add_sequence(
735                        SequenceRecord::Full {
736                            metadata: dr,
737                            sequence: raw_sequence,
738                        },
739                        seqcol.metadata.digest.to_key(),
740                        true, // Always replace Stubs with Full
741                    )?;
742                }
743                StorageMode::Encoded => {
744                    // Create a SequenceEncoder to handle the encoding of the sequence.
745                    let mut encoder = SequenceEncoder::new(dr.alphabet, dr.length);
746                    for seq_line in record.seq_lines() {
747                        encoder.update(seq_line);
748                    }
749                    let encoded_sequence = encoder.finalize();
750
751                    // Always replace Stubs with Full sequences from FASTA
752                    self.add_sequence(
753                        SequenceRecord::Full {
754                            metadata: dr,
755                            sequence: encoded_sequence,
756                        },
757                        seqcol.metadata.digest.to_key(),
758                        true, // Always replace Stubs with Full
759                    )?;
760                }
761            }
762        }
763
764        let encode_elapsed = encode_start.elapsed();
765
766        // Print summary with timing breakdown
767        if !self.quiet {
768            println!(
769                "Added {} ({} seqs) in {:.1}s [{:.1}s digest + {:.1}s encode]",
770                seqcol.metadata.digest,
771                seq_count,
772                digest_elapsed.as_secs_f64() + encode_elapsed.as_secs_f64(),
773                digest_elapsed.as_secs_f64(),
774                encode_elapsed.as_secs_f64()
775            );
776        }
777
778        // Note: If persist_to_disk=true, sequences were already written to disk
779        // and replaced with stubs by add_sequence_record()
780
781        Ok((metadata, true))
782    }
783
784    /// Returns an iterator over all sequence digests in the store
785    pub fn sequence_digests(&self) -> impl Iterator<Item = [u8; 32]> + '_ {
786        self.sequence_store.keys().cloned()
787    }
788
    /// Returns an iterator over sequence metadata for all sequences in the store.
    ///
    /// This is a lightweight operation that returns only metadata (name, length, digests)
    /// without loading sequence data. Order is unspecified (HashMap iteration order).
    ///
    /// # Returns
    /// An iterator over `SequenceMetadata` references.
    ///
    /// # Example
    /// ```ignore
    /// for metadata in store.sequence_metadata() {
    ///     println!("{}: {} bp", metadata.name, metadata.length);
    /// }
    /// ```
    pub fn sequence_metadata(&self) -> impl Iterator<Item = &SequenceMetadata> + '_ {
        // metadata() is available on both Stub and Full records.
        self.sequence_store.values().map(|rec| rec.metadata())
    }
806
807    /// Calculate the total disk size of all sequences in the store
808    ///
809    /// This computes the disk space used by sequence data based on:
810    /// - Sequence length
811    /// - Alphabet type (bits per symbol)
812    /// - Storage mode (Raw or Encoded)
813    ///
814    /// # Returns
815    /// Total bytes used for sequence data on disk
816    ///
817    /// # Note
818    /// This only accounts for sequence data files (.seq), not metadata files
819    /// like RGSI files, rgstore.json, or directory overhead.
820    ///
821    /// # Examples
822    /// ```ignore
823    /// let store = RefgetStore::on_disk("store");
824    /// store.add_sequence_collection_from_fasta("genome.fa")?;
825    /// let disk_size = store.total_disk_size();
826    /// println!("Sequences use {} bytes on disk", disk_size);
827    /// ```
828    pub fn total_disk_size(&self) -> usize {
829        self.sequence_store
830            .values()
831            .map(|rec| rec.metadata().disk_size(&self.mode))
832            .sum()
833    }
834
835    /// Returns the actual disk usage of the store directory.
836    ///
837    /// Walks the local_path directory (if set) and sums all file sizes.
838    /// For in-memory stores without a local_path, returns 0.
839    ///
840    /// This is useful for stats reporting to show actual disk consumption
841    /// regardless of whether sequences are loaded in memory.
842    pub fn actual_disk_usage(&self) -> usize {
843        let Some(path) = &self.local_path else {
844            return 0;
845        };
846
847        fn dir_size(path: &std::path::Path) -> usize {
848            let mut total = 0;
849            if let Ok(entries) = std::fs::read_dir(path) {
850                for entry in entries.flatten() {
851                    let path = entry.path();
852                    if path.is_file() {
853                        total += entry.metadata().map(|m| m.len() as usize).unwrap_or(0);
854                    } else if path.is_dir() {
855                        total += dir_size(&path);
856                    }
857                }
858            }
859            total
860        }
861
862        dir_size(path)
863    }
864
865    // =========================================================================
866    // Collection API
867    // =========================================================================
868
869    /// List all collections in the store (metadata only, no sequence data).
870    ///
871    /// Returns metadata for all collections without loading sequence data.
872    /// Use this for browsing/inventory operations.
873    ///
874    /// # Example
875    /// ```ignore
876    /// for meta in store.list_collections() {
877    ///     println!("{}: {} sequences", meta.digest, meta.n_sequences);
878    /// }
879    /// ```
880    pub fn list_collections(&self) -> Vec<SequenceCollectionMetadata> {
881        let mut result: Vec<_> = self
882            .collections
883            .values()
884            .map(|record| record.metadata().clone())
885            .collect();
886        result.sort_by(|a, b| a.digest.cmp(&b.digest));
887        result
888    }
889
890    /// Get metadata for a single collection by digest (no sequence data).
891    ///
892    /// Use this for lightweight lookups when you don't need sequence data.
893    pub fn get_collection_metadata<K: AsRef<[u8]>>(
894        &self,
895        collection_digest: K,
896    ) -> Option<&SequenceCollectionMetadata> {
897        let key = collection_digest.to_key();
898        self.collections.get(&key).map(|record| record.metadata())
899    }
900
901    /// Get a collection with all its sequences loaded.
902    ///
903    /// This loads the collection metadata and all sequence data, returning
904    /// a complete `SequenceCollection` ready for use.
905    ///
906    /// # Example
907    /// ```ignore
908    /// let collection = store.get_collection("abc123")?;
909    /// for seq in &collection.sequences {
910    ///     println!("{}: {}", seq.metadata().name, seq.decode()?);
911    /// }
912    /// ```
913    pub fn get_collection(&mut self, collection_digest: &str) -> Result<SequenceCollection> {
914        let key = collection_digest.to_key();
915        self.ensure_collection_loaded(&key)?;
916
917        // Get all sequence digests for this collection
918        let seq_digests: Vec<[u8; 32]> = self
919            .name_lookup
920            .get(&key)
921            .map(|name_map| name_map.values().cloned().collect())
922            .unwrap_or_default();
923
924        // NOTE: We do NOT load sequence data here - that would be too slow for remote stores
925        // with hundreds of sequences. Sequences are returned as Stubs with metadata.
926        // Call decode() on individual sequences to load their data on demand.
927
928        // Get collection metadata
929        let metadata = self
930            .collections
931            .get(&key)
932            .ok_or_else(|| anyhow!("Collection not found: {}", collection_digest))?
933            .metadata()
934            .clone();
935
936        // Build sequences list from sequence_store (as Stubs with metadata only)
937        let sequences: Vec<SequenceRecord> = seq_digests
938            .iter()
939            .filter_map(|seq_key| self.sequence_store.get(seq_key).cloned())
940            .collect();
941
942        Ok(SequenceCollection {
943            metadata,
944            sequences,
945        })
946    }
947
948    // =========================================================================
949    // Sequence API
950    // =========================================================================
951
952    /// List all sequences in the store (metadata only, no sequence data).
953    ///
954    /// Returns metadata for all sequences without loading sequence data.
955    /// Use this for browsing/inventory operations.
956    ///
957    /// # Example
958    /// ```ignore
959    /// for meta in store.list_sequences() {
960    ///     println!("{}: {} bp", meta.name, meta.length);
961    /// }
962    /// ```
963    pub fn list_sequences(&self) -> Vec<SequenceMetadata> {
964        let mut result: Vec<_> = self
965            .sequence_store
966            .values()
967            .map(|rec| rec.metadata().clone())
968            .collect();
969        result.sort_by(|a, b| a.sha512t24u.cmp(&b.sha512t24u));
970        result
971    }
972
973    /// Get metadata for a single sequence by digest (no sequence data).
974    ///
975    /// Use this for lightweight lookups when you don't need the actual sequence.
976    pub fn get_sequence_metadata<K: AsRef<[u8]>>(
977        &self,
978        seq_digest: K,
979    ) -> Option<&SequenceMetadata> {
980        let key = seq_digest.to_key();
981        self.sequence_store.get(&key).map(|rec| rec.metadata())
982    }
983
984    /// Get a sequence by its SHA512t24u digest, loading data if needed.
985    ///
986    /// # Example
987    /// ```ignore
988    /// let seq = store.get_sequence("abc123")?;
989    /// println!("{}: {}", seq.metadata().name, seq.decode()?);
990    /// ```
991    pub fn get_sequence<K: AsRef<[u8]>>(&mut self, seq_digest: K) -> Result<&SequenceRecord> {
992        let digest_key = seq_digest.to_key();
993        // Try MD5 lookup first, fallback to using digest directly (SHA512t24u)
994        let actual_key = self
995            .md5_lookup
996            .get(&digest_key)
997            .copied()
998            .unwrap_or(digest_key);
999        self.ensure_sequence_loaded(&actual_key)?;
1000        self.sequence_store.get(&actual_key).ok_or_else(|| {
1001            anyhow!(
1002                "Sequence not found: {}",
1003                String::from_utf8_lossy(seq_digest.as_ref())
1004            )
1005        })
1006    }
1007
1008    /// Get a sequence by collection digest and name, loading data if needed.
1009    ///
1010    /// # Example
1011    /// ```ignore
1012    /// let seq = store.get_sequence_by_name("collection123", "chr1")?;
1013    /// println!("{}", seq.decode()?);
1014    /// ```
1015    pub fn get_sequence_by_name<K: AsRef<[u8]>>(
1016        &mut self,
1017        collection_digest: K,
1018        sequence_name: &str,
1019    ) -> Result<&SequenceRecord> {
1020        let collection_key = collection_digest.to_key();
1021        self.ensure_collection_loaded(&collection_key)?;
1022
1023        let digest_key = if let Some(name_map) = self.name_lookup.get(&collection_key) {
1024            name_map
1025                .get(sequence_name)
1026                .cloned()
1027                .ok_or_else(|| anyhow!("Sequence '{}' not found in collection", sequence_name))?
1028        } else {
1029            return Err(anyhow!(
1030                "Collection not found: {}",
1031                String::from_utf8_lossy(collection_digest.as_ref())
1032            ));
1033        };
1034
1035        self.ensure_sequence_loaded(&digest_key)?;
1036        self.sequence_store.get(&digest_key).ok_or_else(|| {
1037            anyhow!(
1038                "Sequence record not found for '{}' after loading",
1039                sequence_name
1040            )
1041        })
1042    }
1043
1044    /// Iterate over all collections with their sequences loaded.
1045    ///
1046    /// This loads all collection data upfront and returns an iterator over
1047    /// `SequenceCollection` objects with full sequence data.
1048    ///
1049    /// # Example
1050    /// ```ignore
1051    /// for collection in store.iter_collections() {
1052    ///     println!("{}: {} sequences", collection.metadata.digest, collection.sequences.len());
1053    /// }
1054    /// ```
1055    ///
1056    /// Note: For browsing without loading data, use `list_collections()` instead.
1057    pub fn iter_collections(&mut self) -> impl Iterator<Item = SequenceCollection> {
1058        // Collect digests first to avoid borrow issues
1059        let mut digests: Vec<String> = self
1060            .collections
1061            .values()
1062            .map(|rec| rec.metadata().digest.clone())
1063            .collect();
1064        digests.sort();
1065
1066        // Load each collection in sorted order
1067        let mut collections = Vec::new();
1068        for digest in digests {
1069            if let Ok(collection) = self.get_collection(&digest) {
1070                collections.push(collection);
1071            }
1072        }
1073        collections.into_iter()
1074    }
1075
1076    /// Iterate over all sequences with their data loaded.
1077    ///
1078    /// This ensures all sequence data is loaded and returns an iterator over
1079    /// `SequenceRecord` objects with full sequence data.
1080    ///
1081    /// # Example
1082    /// ```ignore
1083    /// for seq in store.iter_sequences() {
1084    ///     println!("{}: {}", seq.metadata().name, seq.decode().unwrap_or_default());
1085    /// }
1086    /// ```
1087    ///
1088    /// Note: For browsing without loading data, use `list_sequences()` instead.
1089    pub fn iter_sequences(&mut self) -> impl Iterator<Item = SequenceRecord> {
1090        // Collect keys first to avoid borrow issues
1091        let keys: Vec<[u8; 32]> = self.sequence_store.keys().cloned().collect();
1092
1093        // Load each sequence
1094        for key in &keys {
1095            let _ = self.ensure_sequence_loaded(key);
1096        }
1097
1098        // Return cloned records sorted by digest
1099        let mut records: Vec<_> = self.sequence_store.values().cloned().collect();
1100        records.sort_by(|a, b| a.metadata().sha512t24u.cmp(&b.metadata().sha512t24u));
1101        records.into_iter()
1102    }
1103
1104    /// Check if a collection is fully loaded (Full) or just metadata (Stub)
1105    pub fn is_collection_loaded<K: AsRef<[u8]>>(&self, collection_digest: K) -> bool {
1106        let key = collection_digest.to_key();
1107        self.collections
1108            .get(&key)
1109            .map_or(false, |record| record.has_sequences())
1110    }
1111
    /// Returns the local path where the store is located (if any).
    ///
    /// `None` when the store has no backing directory (e.g. a purely
    /// in-memory store).
    pub fn local_path(&self) -> Option<&PathBuf> {
        self.local_path.as_ref()
    }
1116
    /// Returns the remote source URL (if any).
    ///
    /// Set when the store was loaded from a URL (see `load_remote`);
    /// `None` for stores created or loaded locally.
    pub fn remote_source(&self) -> Option<&str> {
        self.remote_source.as_deref()
    }
1121
    /// Returns the storage mode used by this store.
    ///
    /// Either `Raw` (plain bytes) or `Encoded` (bit-packed storage; see
    /// `set_encoding_mode` in the module docs).
    pub fn storage_mode(&self) -> StorageMode {
        self.mode
    }
1126
1127    /// Get an iterator over substrings defined by BED file regions.
1128    ///
1129    /// Reads a BED file line-by-line and yields substrings for each region.
1130    /// This is memory-efficient for large BED files as it streams results.
1131    ///
1132    /// # Arguments
1133    /// * `collection_digest` - The collection digest containing the sequences
1134    /// * `bed_file_path` - Path to the BED file defining regions
1135    ///
1136    /// # Returns
1137    /// Iterator yielding `Result<RetrievedSequence>` for each BED region
1138    ///
1139    /// # Example
1140    /// ```ignore
1141    /// let iter = store.substrings_from_regions(digest, "regions.bed")?;
1142    /// for result in iter {
1143    ///     let seq = result?;
1144    ///     println!("{}:{}-{}: {}", seq.chrom_name, seq.start, seq.end, seq.sequence);
1145    /// }
1146    /// ```
1147    pub fn substrings_from_regions<'a, K: AsRef<[u8]>>(
1148        &'a mut self,
1149        collection_digest: K,
1150        bed_file_path: &str,
1151    ) -> Result<SubstringsFromRegions<'a, K>, Box<dyn std::error::Error>> {
1152        let path = Path::new(bed_file_path);
1153        let file_info = get_file_info(path);
1154        let is_gzipped = file_info.is_gzipped;
1155
1156        let opened_bed_file = File::open(path)?;
1157
1158        let reader: Box<dyn Read> = match is_gzipped {
1159            true => Box::new(GzDecoder::new(BufReader::new(opened_bed_file))),
1160            false => Box::new(opened_bed_file),
1161        };
1162        let reader = BufReader::new(reader);
1163
1164        Ok(SubstringsFromRegions {
1165            store: self,
1166            reader,
1167            collection_digest,
1168            previous_parsed_chr: String::new(),
1169            current_seq_digest: String::new(),
1170            line_num: 0,
1171        })
1172    }
1173
    /// Export sequences from BED file regions to a FASTA file.
    ///
    /// Reads a BED file defining genomic regions and exports the sequences
    /// for those regions to a FASTA file. This is useful for extracting
    /// specific regions of interest from a genome.
    ///
    /// Output is gzip-compressed when `output_file_path` ends in `.gz`.
    /// A header of the form `>name length alphabet sha512t24u md5` is
    /// emitted each time the region's chromosome (header) changes; all
    /// substrings under the same header are concatenated with no separator
    /// or line wrapping.
    ///
    /// # Arguments
    /// * `collection_digest` - The collection digest containing the sequences
    /// * `bed_file_path` - Path to the BED file defining regions
    /// * `output_file_path` - Path to write the output FASTA file
    ///
    /// # Returns
    /// Result indicating success or error
    ///
    /// # Example
    /// ```ignore
    /// store.export_fasta_from_regions(
    ///     digest,
    ///     "regions.bed",
    ///     "output.fa"
    /// )?;
    /// ```
    pub fn export_fasta_from_regions<K: AsRef<[u8]>>(
        &mut self,
        collection_digest: K,
        bed_file_path: &str,
        output_file_path: &str,
    ) -> Result<(), Box<dyn std::error::Error>> {
        // Set up the output path and create directories if they don't exist
        let output_path_obj = Path::new(output_file_path);
        if let Some(parent) = output_path_obj.parent() {
            create_dir_all(parent)?;
        }

        // Create output file with optional gzip compression
        let file = File::create(output_file_path)?;

        let mut writer: Box<dyn Write> = if output_path_obj.extension() == Some(OsStr::new("gz")) {
            Box::new(GzEncoder::new(file, Compression::default()))
        } else {
            Box::new(file)
        };

        // Pre-fetch all sequence metadata from the collection to avoid borrowing issues
        // (substrings_from_regions below takes a mutable borrow of self for
        // the lifetime of the iterator, so no store lookups are possible
        // inside the loop).
        let collection_key = collection_digest.as_ref().to_key();

        // Ensure collection is loaded (populates name_lookup for lazy-loaded stores)
        self.ensure_collection_loaded(&collection_key)?;

        // Map of sequence name -> (name, length, alphabet, sha512t24u, md5),
        // i.e. the fields needed to render a FASTA header.
        let name_to_metadata: HashMap<String, (String, usize, AlphabetType, String, String)> = self
            .name_lookup
            .get(&collection_key)
            .map(|name_map| {
                name_map
                    .iter()
                    .filter_map(|(name, seq_digest)| {
                        self.sequence_store.get(seq_digest).map(|record| {
                            let metadata = record.metadata();
                            (
                                name.clone(),
                                (
                                    metadata.name.clone(),
                                    metadata.length,
                                    metadata.alphabet,
                                    metadata.sha512t24u.clone(),
                                    metadata.md5.clone(),
                                ),
                            )
                        })
                    })
                    .collect()
            })
            .unwrap_or_default();

        let seq_iter = self.substrings_from_regions(&collection_digest, bed_file_path)?;

        // State for the header machine: a new header is written only when
        // the rendered header text changes between consecutive regions.
        let mut previous_parsed_chr = String::new();
        let mut current_header: String = String::new();
        let mut previous_header: String = String::new();

        for rs in seq_iter.into_iter() {
            let rs = rs?;

            if previous_parsed_chr != rs.chrom_name {
                previous_parsed_chr = rs.chrom_name.clone();

                // Look up metadata from our pre-fetched map
                // NOTE(review): if the BED chrom is absent from
                // `name_to_metadata`, `current_header` silently keeps the
                // previous chromosome's header (or stays empty for the first
                // chrom) — confirm unknown chroms are rejected upstream.
                if let Some((name, length, alphabet, sha512, md5)) =
                    name_to_metadata.get(&rs.chrom_name)
                {
                    current_header =
                        format!(">{} {} {} {} {}", name, length, alphabet, sha512, md5);
                }
            }

            let retrieved_substring = rs.sequence;

            if previous_header != current_header {
                // Separate FASTA records with a newline, except before the
                // very first header.
                let prefix = if previous_header.is_empty() { "" } else { "\n" };

                previous_header = current_header.clone();

                // Combine the prefix, current_header, and a trailing newline
                let header_to_be_written = format!("{}{}\n", prefix, current_header);
                writer.write_all(header_to_be_written.as_bytes())?;
            }

            // Substrings under the same header are written back-to-back,
            // unwrapped.
            writer.write_all(retrieved_substring.as_ref())?;
        }

        // Ensure all data is flushed (important for gzip)
        writer.flush()?;

        Ok(())
    }
1289
1290    /// Retrieves a substring from an encoded sequence by its SHA512t24u digest.
1291    ///
1292    /// # Arguments
1293    ///
1294    /// * `sha512_digest` - The SHA512t24u digest of the sequence
1295    /// * `start` - The start index of the substring (inclusive)
1296    /// * `end` - The end index of the substring (exclusive)
1297    ///
1298    /// # Returns
1299    ///
1300    /// The substring if the sequence is found, or an error if not found or invalid range
1301    pub fn get_substring<K: AsRef<[u8]>>(
1302        &mut self,
1303        sha512_digest: K,
1304        start: usize,
1305        end: usize,
1306    ) -> Result<String> {
1307        let digest_key = sha512_digest.to_key();
1308
1309        // Ensure the sequence data is loaded
1310        self.ensure_sequence_loaded(&digest_key)?;
1311
1312        let record = self.sequence_store.get(&digest_key).ok_or_else(|| {
1313            anyhow!(
1314                "Sequence not found: {}",
1315                String::from_utf8_lossy(sha512_digest.as_ref())
1316            )
1317        })?;
1318        let (metadata, sequence) = match record {
1319            SequenceRecord::Stub(_) => return Err(anyhow!("Sequence data not loaded (stub only)")),
1320            SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
1321        };
1322
1323        if start >= metadata.length || end > metadata.length || start >= end {
1324            return Err(anyhow!(
1325                "Invalid substring range: start={}, end={}, sequence length={}",
1326                start,
1327                end,
1328                metadata.length
1329            ));
1330        }
1331
1332        match self.mode {
1333            StorageMode::Encoded => {
1334                let alphabet = lookup_alphabet(&metadata.alphabet);
1335                let decoded_sequence = decode_substring_from_bytes(sequence, start, end, alphabet);
1336                String::from_utf8(decoded_sequence)
1337                    .map_err(|e| anyhow!("Failed to decode UTF-8 sequence: {}", e))
1338            }
1339            StorageMode::Raw => {
1340                let raw_slice: &[u8] = &sequence[start..end];
1341                String::from_utf8(raw_slice.to_vec())
1342                    .map_err(|e| anyhow!("Failed to decode UTF-8 sequence: {}", e))
1343            }
1344        }
1345    }
1346
1347    /// Export sequences from a collection to a FASTA file
1348    ///
1349    /// # Arguments
1350    /// * `collection_digest` - The digest of the collection to export from
1351    /// * `output_path` - Path to write the FASTA file
1352    /// * `sequence_names` - Optional list of sequence names to export.
1353    ///                      If None, exports all sequences in the collection.
1354    /// * `line_width` - Optional line width for wrapping sequences (default: 80)
1355    ///
1356    /// # Returns
1357    /// Result indicating success or error
1358    pub fn export_fasta<K: AsRef<[u8]>, P: AsRef<Path>>(
1359        &mut self,
1360        collection_digest: K,
1361        output_path: P,
1362        sequence_names: Option<Vec<&str>>,
1363        line_width: Option<usize>,
1364    ) -> Result<()> {
1365        let line_width = line_width.unwrap_or(80);
1366        let output_path = output_path.as_ref();
1367        let collection_key = collection_digest.as_ref().to_key();
1368
1369        // Ensure collection is loaded (populates name_lookup for lazy-loaded stores)
1370        self.ensure_collection_loaded(&collection_key)?;
1371
1372        // Get the name map for this collection and build a map of name -> digest
1373        let name_to_digest: HashMap<String, [u8; 32]> = self
1374            .name_lookup
1375            .get(&collection_key)
1376            .ok_or_else(|| {
1377                anyhow!(
1378                    "Collection not found: {:?}",
1379                    String::from_utf8_lossy(collection_digest.as_ref())
1380                )
1381            })?
1382            .clone();
1383
1384        // Determine which sequences to export
1385        let names_to_export: Vec<String> = if let Some(names) = sequence_names {
1386            // Filter to only requested names
1387            names.iter().map(|s| s.to_string()).collect()
1388        } else {
1389            // Export all sequences in the collection
1390            name_to_digest.keys().cloned().collect()
1391        };
1392
1393        // Create output file with optional gzip compression
1394        let file = File::create(output_path).context(format!(
1395            "Failed to create output file: {}",
1396            output_path.display()
1397        ))?;
1398
1399        let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
1400            Box::new(GzEncoder::new(file, Compression::default()))
1401        } else {
1402            Box::new(file)
1403        };
1404
1405        // Export each sequence
1406        for seq_name in names_to_export {
1407            // Get the sequence digest from the name map
1408            let seq_digest = name_to_digest
1409                .get(&seq_name)
1410                .ok_or_else(|| anyhow!("Sequence '{}' not found in collection", seq_name))?;
1411
1412            // Ensure sequence is loaded
1413            self.ensure_sequence_loaded(seq_digest)?;
1414
1415            // Get the sequence record
1416            let record = self
1417                .sequence_store
1418                .get(seq_digest)
1419                .ok_or_else(|| anyhow!("Sequence record not found for digest: {:?}", seq_digest))?;
1420
1421            // Get the sequence data
1422            let (metadata, sequence_data) = match record {
1423                SequenceRecord::Stub(_) => {
1424                    return Err(anyhow!("Sequence data not loaded for '{}'", seq_name));
1425                }
1426                SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
1427            };
1428
1429            // Decode the sequence based on storage mode
1430            let decoded_sequence = match self.mode {
1431                StorageMode::Encoded => {
1432                    let alphabet = lookup_alphabet(&metadata.alphabet);
1433                    let decoded =
1434                        decode_substring_from_bytes(sequence_data, 0, metadata.length, alphabet);
1435                    String::from_utf8(decoded).context("Failed to decode sequence as UTF-8")?
1436                }
1437                StorageMode::Raw => String::from_utf8(sequence_data.clone())
1438                    .context("Failed to decode raw sequence as UTF-8")?,
1439            };
1440
1441            // Write FASTA header (include description if present)
1442            let header = match &metadata.description {
1443                Some(desc) => format!(">{} {}", metadata.name, desc),
1444                None => format!(">{}", metadata.name),
1445            };
1446            writeln!(writer, "{}", header)?;
1447
1448            // Write sequence with line wrapping
1449            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
1450                writer.write_all(chunk)?;
1451                writer.write_all(b"\n")?;
1452            }
1453        }
1454
1455        // Ensure all data is flushed (important for gzip)
1456        writer.flush()?;
1457
1458        Ok(())
1459    }
1460
    /// Export sequences by their sequence digests to a FASTA file
    ///
    /// Bypasses collection information and exports sequences directly via sequence digests.
    /// Output is gzip-compressed when `output_path` ends in `.gz`, and
    /// sequences are written in the order the digests are given.
    ///
    /// # Arguments
    /// * `seq_digests` - List of SHA512t24u sequence digests (not collection digests) to export
    /// * `output_path` - Path to write the FASTA file
    /// * `line_width` - Optional line width for wrapping sequences (default: 80)
    ///
    /// # Returns
    /// Result indicating success or error
    pub fn export_fasta_by_digests<P: AsRef<Path>>(
        &mut self,
        seq_digests: Vec<&str>,
        output_path: P,
        line_width: Option<usize>,
    ) -> Result<()> {
        // NOTE(review): a line_width of 0 would panic in chunks() below —
        // callers must pass >= 1 (or None for the default).
        let line_width = line_width.unwrap_or(80);
        let output_path = output_path.as_ref();

        // Create output file with optional gzip compression
        let file = File::create(output_path).context(format!(
            "Failed to create output file: {}",
            output_path.display()
        ))?;

        let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
            Box::new(GzEncoder::new(file, Compression::default()))
        } else {
            Box::new(file)
        };

        // Export each sequence
        // NOTE(review): this loop mirrors export_fasta()'s per-sequence
        // write logic; consider extracting a shared helper.
        for digest_str in seq_digests {
            let digest_key = digest_str.as_bytes().to_key();

            // Ensure sequence is loaded
            self.ensure_sequence_loaded(&digest_key)?;

            // Get the sequence record
            let record = self
                .sequence_store
                .get(&digest_key)
                .ok_or_else(|| anyhow!("Sequence record not found for digest: {}", digest_str))?;

            // Get the sequence data (must be Full after ensure_sequence_loaded)
            let (metadata, sequence_data) = match record {
                SequenceRecord::Stub(_) => {
                    return Err(anyhow!(
                        "Sequence data not loaded for digest: {}",
                        digest_str
                    ));
                }
                SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
            };

            // Decode the sequence based on storage mode
            let decoded_sequence = match self.mode {
                StorageMode::Encoded => {
                    let alphabet = lookup_alphabet(&metadata.alphabet);
                    let decoded =
                        decode_substring_from_bytes(sequence_data, 0, metadata.length, alphabet);
                    String::from_utf8(decoded).context("Failed to decode sequence as UTF-8")?
                }
                StorageMode::Raw => String::from_utf8(sequence_data.clone())
                    .context("Failed to decode raw sequence as UTF-8")?,
            };

            // Write FASTA header (include description if present)
            let header = match &metadata.description {
                Some(desc) => format!(">{} {}", metadata.name, desc),
                None => format!(">{}", metadata.name),
            };
            writeln!(writer, "{}", header)?;

            // Write sequence with line wrapping
            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
                writer.write_all(chunk)?;
                writer.write_all(b"\n")?;
            }
        }

        // Ensure all data is flushed (important for gzip)
        writer.flush()?;

        Ok(())
    }
1547
1548    /// Helper function to get the relative path for a sequence based on its SHA512t24u digest string
1549    fn get_sequence_path(digest_str: &str, template: &str) -> PathBuf {
1550        let path_str = template
1551            .replace("%s2", &digest_str[0..2])
1552            .replace("%s", digest_str);
1553
1554        PathBuf::from(path_str)
1555    }
1556
1557    /// Write a single sequence to disk using the configured path template
1558    fn write_sequence_to_disk_single(
1559        &self,
1560        metadata: &SequenceMetadata,
1561        sequence: &[u8],
1562    ) -> Result<()> {
1563        let template = self
1564            .seqdata_path_template
1565            .as_ref()
1566            .context("seqdata_path_template not set")?;
1567        let local_path = self.local_path.as_ref().context("local_path not set")?;
1568
1569        // Build path using template
1570        let seq_file_path = Self::get_sequence_path(&metadata.sha512t24u, template);
1571        let full_path = local_path.join(&seq_file_path);
1572
1573        // Create parent directory
1574        if let Some(parent) = full_path.parent() {
1575            create_dir_all(parent)?;
1576        }
1577
1578        // Write sequence data
1579        let mut file = File::create(&full_path)?;
1580        file.write_all(sequence)?;
1581
1582        Ok(())
1583    }
1584
1585    /// Write a single collection RGSI file to disk
1586    /// Used when persist_to_disk=true to persist collections incrementally
1587    fn write_collection_to_disk_single(&self, record: &SequenceCollectionRecord) -> Result<()> {
1588        let local_path = self.local_path.as_ref().context("local_path not set")?;
1589
1590        // Build path: collections/{digest}.rgsi
1591        let coll_file_path = format!("collections/{}.rgsi", record.metadata().digest);
1592        let full_path = local_path.join(&coll_file_path);
1593
1594        // Create parent directory
1595        if let Some(parent) = full_path.parent() {
1596            create_dir_all(parent)?;
1597        }
1598
1599        // Write collection RGSI file
1600        record.write_collection_rgsi(&full_path)?;
1601
1602        Ok(())
1603    }
1604
1605    /// Write index files (sequences.rgsi, collections.rgci, and rgstore.json) to disk
1606    ///
1607    /// This allows the store to be loaded later via load_local().
1608    /// Called automatically when adding collections in disk-backed mode.
1609    fn write_index_files(&self) -> Result<()> {
1610        let local_path = self.local_path.as_ref().context("local_path not set")?;
1611        let template = self
1612            .seqdata_path_template
1613            .as_ref()
1614            .context("seqdata_path_template not set")?;
1615
1616        // Write the sequence metadata index file (sequences.rgsi)
1617        let sequence_index_path = local_path.join("sequences.rgsi");
1618        self.write_sequences_rgsi(&sequence_index_path)?;
1619
1620        // Write the collection metadata index file (NEW)
1621        let collection_index_path = local_path.join("collections.rgci");
1622        self.write_collections_rgci(&collection_index_path)?;
1623
1624        // Create the metadata structure
1625        let metadata = StoreMetadata {
1626            version: 1,
1627            seqdata_path_template: template.clone(),
1628            collections_path_template: "collections/%s.rgsi".to_string(),
1629            sequence_index: "sequences.rgsi".to_string(),
1630            collection_index: Some("collections.rgci".to_string()),
1631            mode: self.mode,
1632            created_at: Utc::now().to_rfc3339(),
1633        };
1634
1635        // Write metadata to rgstore.json
1636        let json = serde_json::to_string_pretty(&metadata)
1637            .context("Failed to serialize metadata to JSON")?;
1638        fs::write(local_path.join("rgstore.json"), json).context("Failed to write rgstore.json")?;
1639
1640        Ok(())
1641    }
1642
1643    /// Write collection metadata index (collections.rgci) to disk
1644    ///
1645    /// Creates a master index of all collections with their metadata.
1646    /// Format: TSV with columns: digest, n_sequences, names_digest, sequences_digest, lengths_digest
1647    fn write_collections_rgci<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
1648        let file_path = file_path.as_ref();
1649        let mut file = File::create(file_path)?;
1650
1651        // Write header
1652        writeln!(
1653            file,
1654            "#digest\tn_sequences\tnames_digest\tsequences_digest\tlengths_digest"
1655        )?;
1656
1657        // Write collection metadata for all collections
1658        for record in self.collections.values() {
1659            let meta = record.metadata();
1660            writeln!(
1661                file,
1662                "{}\t{}\t{}\t{}\t{}",
1663                meta.digest,
1664                meta.n_sequences,
1665                meta.names_digest,
1666                meta.sequences_digest,
1667                meta.lengths_digest,
1668            )?;
1669        }
1670        Ok(())
1671    }
1672
1673    /// Write all sequence metadata to an RGSI file.
1674    ///
1675    /// Creates a global sequence index file containing metadata for all sequences
1676    /// in the store across all collections.
1677    pub fn write_sequences_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
1678        let file_path = file_path.as_ref();
1679        let mut file = std::fs::File::create(file_path)?;
1680
1681        // Write header with column names (6-column format, description at end)
1682        writeln!(
1683            file,
1684            "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
1685        )?;
1686
1687        // Write sequence metadata for all sequences
1688        for result_sr in self.sequence_store.values() {
1689            let result = result_sr.metadata().clone();
1690            let description = result.description.as_deref().unwrap_or("");
1691            writeln!(
1692                file,
1693                "{}\t{}\t{}\t{}\t{}\t{}",
1694                result.name,
1695                result.length,
1696                result.alphabet,
1697                result.sha512t24u,
1698                result.md5,
1699                description
1700            )?;
1701        }
1702        Ok(())
1703    }
1704
1705    /// Validate a relative path to prevent directory traversal attacks.
1706    /// Rejects absolute paths, paths with "..", and paths with null bytes.
1707    fn sanitize_relative_path(path: &str) -> Result<()> {
1708        if path.starts_with('/') || path.starts_with('\\') {
1709            return Err(anyhow!("Absolute paths not allowed: {}", path));
1710        }
1711        if path.contains("..") {
1712            return Err(anyhow!("Directory traversal not allowed: {}", path));
1713        }
1714        if path.contains('\0') {
1715            return Err(anyhow!("Null bytes not allowed in path"));
1716        }
1717        Ok(())
1718    }
1719
    /// Helper function to fetch a file from local path or remote source
    /// Returns the file contents as Vec<u8>
    ///
    /// Resolution order:
    /// 1. If `persist_to_disk` is true and the file already exists under
    ///    `local_path`, return the cached bytes (no network access).
    /// 2. Otherwise, if `remote_source` is configured, download
    ///    `{remote_source}/{relative_path}`, write it back into the local
    ///    cache when `persist_to_disk` is true, and return the bytes.
    /// 3. Otherwise, fail.
    ///
    /// # Arguments
    /// * `local_path` - optional root of the local store/cache directory
    /// * `remote_source` - optional base URL of the remote store
    /// * `relative_path` - store-relative file path; validated against
    ///   directory traversal before any filesystem or network access
    /// * `persist_to_disk` - whether to consult and populate the local cache
    fn fetch_file(
        local_path: &Option<PathBuf>,
        remote_source: &Option<String>,
        relative_path: &str,
        persist_to_disk: bool,
    ) -> Result<Vec<u8>> {
        // Validate the relative path to prevent directory traversal
        Self::sanitize_relative_path(relative_path)?;

        // Check if file exists locally (only if caching is enabled and path exists)
        if persist_to_disk {
            if let Some(local_path) = local_path {
                let full_local_path = local_path.join(relative_path);
                if full_local_path.exists() {
                    return fs::read(&full_local_path).context(format!(
                        "Failed to read local file: {}",
                        full_local_path.display()
                    ));
                }
            }
        }

        // If not local and we have a remote source, fetch from remote
        if let Some(remote_url) = remote_source {
            // Join base URL and relative path without doubling the '/'
            let full_remote_url = if remote_url.ends_with('/') {
                format!("{}{}", remote_url, relative_path)
            } else {
                format!("{}/{}", remote_url, relative_path)
            };

            // ureq's call() returns Err on transport failures (and, by its
            // default behavior, on non-2xx statuses)
            let response = ureq::get(&full_remote_url)
                .call()
                .map_err(|e| anyhow!("Failed to fetch from remote: {}", e))?;

            let mut data = Vec::new();
            response
                .into_reader()
                .read_to_end(&mut data)
                .context("Failed to read response body")?;

            // Save to local cache only if caching is enabled
            // NOTE(review): a cache-write failure aborts the whole fetch even
            // though `data` is already in memory - confirm whether degrading
            // to a warning would be preferable here.
            if persist_to_disk {
                if let Some(local_path) = local_path {
                    let full_local_path = local_path.join(relative_path);

                    // Create parent directory if needed
                    if let Some(parent) = full_local_path.parent() {
                        create_dir_all(parent)?;
                    }

                    // Save to disk
                    fs::write(&full_local_path, &data).context(format!(
                        "Failed to cache file to: {}",
                        full_local_path.display()
                    ))?;
                }
            }

            Ok(data)
        } else {
            Err(anyhow!(
                "File not found locally and no remote source configured: {}",
                relative_path
            ))
        }
    }
1788
    /// Open a local RefgetStore from a directory.
    ///
    /// This loads only lightweight metadata and stubs. Collections and sequences
    /// remain as stubs until explicitly loaded with load_collection()/load_sequence().
    ///
    /// # Arguments
    /// * `path` - Path to the store directory
    ///
    /// # Errors
    /// Fails if rgstore.json is missing or unparseable, or if any relative
    /// path recorded in the metadata fails traversal validation.
    ///
    /// Expects: rgstore.json, sequences.rgsi, collections.rgci, collections/*.rgsi
    pub fn open_local<P: AsRef<Path>>(path: P) -> Result<Self> {
        let root_path = path.as_ref();

        // Read rgstore.json index file
        let index_path = root_path.join("rgstore.json");
        let json = fs::read_to_string(&index_path).context(format!(
            "Failed to read rgstore.json from {}",
            index_path.display()
        ))?;

        let metadata: StoreMetadata =
            serde_json::from_str(&json).context("Failed to parse store metadata")?;

        // Validate paths from metadata to prevent directory traversal
        // (the store directory may have been produced by an untrusted party)
        Self::sanitize_relative_path(&metadata.seqdata_path_template)?;
        Self::sanitize_relative_path(&metadata.sequence_index)?;
        if let Some(ref ci) = metadata.collection_index {
            Self::sanitize_relative_path(ci)?;
        }

        // Create a new empty store with the correct mode
        let mut store = RefgetStore::new(metadata.mode);
        store.local_path = Some(root_path.to_path_buf());
        store.seqdata_path_template = Some(metadata.seqdata_path_template.clone());
        store.persist_to_disk = true; // Local stores always use disk

        // Load sequence metadata from the sequence index file (metadata only, no data)
        // A missing index just means the store starts with no sequence stubs
        let sequence_index_path = root_path.join(&metadata.sequence_index);
        if sequence_index_path.exists() {
            Self::load_sequences_from_index(&mut store, &sequence_index_path)?;
        }

        // Try to load collection stubs from collections.rgci (new format)
        if let Some(ref collection_index) = metadata.collection_index {
            let collection_index_path = root_path.join(collection_index);
            if collection_index_path.exists() {
                Self::load_collection_stubs_from_rgci(&mut store, &collection_index_path)?;
            }
        }

        // If no collection stubs loaded (missing rgci), load full collections from directory
        // (legacy layout fallback: eagerly reads every collections/*.rgsi file)
        if store.collections.is_empty() {
            let collections_dir = root_path.join("collections");
            Self::load_collections_from_directory(&mut store, &collections_dir)?;
        }

        Ok(store)
    }
1846
1847    /// Load sequence metadata from a sequence index file (sequences.rgsi)
1848    fn load_sequences_from_index(store: &mut RefgetStore, index_path: &Path) -> Result<()> {
1849        let file = std::fs::File::open(index_path)?;
1850        let reader = std::io::BufReader::new(file);
1851
1852        for line in reader.lines() {
1853            let line = line?;
1854
1855            // Skip comment lines
1856            if line.starts_with('#') {
1857                continue;
1858            }
1859
1860            // Parse sequence metadata line
1861            if let Some(seq_metadata) = parse_rgsi_line(&line) {
1862                // Create a SequenceRecord with no data (lazy loading)
1863                let record = SequenceRecord::Stub(seq_metadata.clone());
1864
1865                // Add to store
1866                let sha512_key = seq_metadata.sha512t24u.to_key();
1867                store.sequence_store.insert(sha512_key, record);
1868
1869                // Add to MD5 lookup
1870                let md5_key = seq_metadata.md5.to_key();
1871                store.md5_lookup.insert(md5_key, sha512_key);
1872            }
1873        }
1874
1875        Ok(())
1876    }
1877
1878    /// Load collection stubs from collections.rgci index file (new format)
1879    fn load_collection_stubs_from_rgci(store: &mut RefgetStore, index_path: &Path) -> Result<()> {
1880        let file = std::fs::File::open(index_path)?;
1881        let reader = std::io::BufReader::new(file);
1882
1883        for line in reader.lines() {
1884            let line = line?;
1885
1886            if let Some(metadata) = parse_rgci_line(&line) {
1887                let key = metadata.digest.to_key();
1888                // Create a SequenceCollectionRecord::Stub (sequences not loaded)
1889                // Note: name_lookup is NOT populated for stubs - it will be populated
1890                // when the collection is loaded via ensure_collection_loaded()
1891                store
1892                    .collections
1893                    .insert(key, SequenceCollectionRecord::Stub(metadata));
1894            }
1895        }
1896
1897        Ok(())
1898    }
1899
1900    /// Load full collections from a collections directory (fallback when no RGCI exists).
1901    ///
1902    /// Reads all .rgsi files in the directory and loads them as Full collections.
1903    fn load_collections_from_directory(
1904        store: &mut RefgetStore,
1905        collections_dir: &Path,
1906    ) -> Result<()> {
1907        if !collections_dir.exists() {
1908            return Ok(());
1909        }
1910
1911        for entry in fs::read_dir(collections_dir)? {
1912            let entry = entry?;
1913            let path = entry.path();
1914
1915            if path.is_file() && path.extension() == Some(OsStr::new("rgsi")) {
1916                // Load the collection from the file
1917                let collection = read_rgsi_file(&path)?;
1918                let collection_digest = collection.metadata.digest.to_key();
1919
1920                // Convert to SequenceCollectionRecord::Full
1921                let record = SequenceCollectionRecord::from(collection.clone());
1922
1923                // Add collection record to store
1924                store.collections.insert(collection_digest, record);
1925
1926                // Build name lookup for this collection
1927                let mut name_map = HashMap::new();
1928                for sequence_record in &collection.sequences {
1929                    let metadata = sequence_record.metadata();
1930                    let sha512_key = metadata.sha512t24u.to_key();
1931                    name_map.insert(metadata.name.clone(), sha512_key);
1932                }
1933                store.name_lookup.insert(collection_digest, name_map);
1934            }
1935        }
1936
1937        Ok(())
1938    }
1939
    /// Open a remote RefgetStore with local caching.
    ///
    /// This loads only lightweight metadata and stubs from the remote URL.
    /// Data is fetched on-demand when load_collection()/load_sequence() is called.
    ///
    /// # Arguments
    /// * `cache_path` - Local directory for caching fetched data
    /// * `remote_url` - URL of the remote store
    ///
    /// # Errors
    /// Fails if the cache directory can't be created, if rgstore.json or the
    /// sequence index can't be fetched/parsed, or if any relative path in the
    /// remote metadata fails traversal validation.
    ///
    /// # Notes
    /// By default, persistence is enabled (sequences are cached to disk).
    /// Call `disable_persistence()` after loading to keep only in memory.
    pub fn open_remote<P: AsRef<Path>, S: AsRef<str>>(
        cache_path: P,
        remote_url: S,
    ) -> Result<Self> {
        let cache_path = cache_path.as_ref();
        let remote_url = remote_url.as_ref().to_string();

        // Create cache directory
        create_dir_all(cache_path)?;

        // Fetch rgstore.json from remote (always cached: it's small and
        // required for every subsequent load)
        let index_data = Self::fetch_file(
            &Some(cache_path.to_path_buf()),
            &Some(remote_url.clone()),
            "rgstore.json",
            true,
        )?;

        let json =
            String::from_utf8(index_data).context("Store metadata contains invalid UTF-8")?;

        let metadata: StoreMetadata =
            serde_json::from_str(&json).context("Failed to parse store metadata")?;

        // Validate paths from metadata to prevent directory traversal
        // (the remote server is not trusted to supply safe paths)
        Self::sanitize_relative_path(&metadata.seqdata_path_template)?;
        Self::sanitize_relative_path(&metadata.sequence_index)?;
        if let Some(ref ci) = metadata.collection_index {
            Self::sanitize_relative_path(ci)?;
        }

        // Create a new empty store with the correct mode
        let mut store = RefgetStore::new(metadata.mode);
        store.local_path = Some(cache_path.to_path_buf());
        store.remote_source = Some(remote_url.clone());
        store.seqdata_path_template = Some(metadata.seqdata_path_template.clone());
        store.persist_to_disk = true; // Default to true; user can call disable_persistence() after

        // Fetch sequence index from remote (always cache metadata - it's small)
        let sequence_index_data = Self::fetch_file(
            &Some(cache_path.to_path_buf()),
            &Some(remote_url.clone()),
            &metadata.sequence_index,
            true, // Always cache metadata
        )?;
        let sequence_index_str = String::from_utf8(sequence_index_data)
            .context("sequence index contains invalid UTF-8")?;

        // Parse sequence metadata (metadata only, no data)
        for line in sequence_index_str.lines() {
            // Skip comment lines
            if line.starts_with('#') {
                continue;
            }

            // Parse sequence metadata line
            if let Some(seq_metadata) = parse_rgsi_line(line) {
                // Create a SequenceRecord with no data (lazy loading)
                let record = SequenceRecord::Stub(seq_metadata.clone());

                // Add to store
                let sha512_key = seq_metadata.sha512t24u.to_key();
                store.sequence_store.insert(sha512_key, record);

                // Add to MD5 lookup
                let md5_key = seq_metadata.md5.to_key();
                store.md5_lookup.insert(md5_key, sha512_key);
            }
        }

        // Try to fetch and load collection stubs from collections.rgci (new format)
        // NOTE(review): a failed fetch here is silently ignored and we fall
        // through to the local collections directory below - confirm this is
        // intended for transient network errors (a flaky connection would
        // leave the store with no collection stubs rather than failing).
        if let Some(ref collection_index) = metadata.collection_index {
            if let Ok(collection_index_data) = Self::fetch_file(
                &Some(cache_path.to_path_buf()),
                &Some(remote_url.clone()),
                collection_index,
                true,
            ) {
                let collection_index_str = String::from_utf8(collection_index_data)
                    .context("collection index contains invalid UTF-8")?;

                // Parse collection stubs
                for line in collection_index_str.lines() {
                    if let Some(coll_metadata) = parse_rgci_line(line) {
                        let key = coll_metadata.digest.to_key();
                        store
                            .collections
                            .insert(key, SequenceCollectionRecord::Stub(coll_metadata));
                    }
                }
            }
        }

        // If no collection stubs loaded, check for cached collections in local directory
        if store.collections.is_empty() {
            let local_collections_dir = cache_path.join("collections");
            create_dir_all(&local_collections_dir)?; // Ensure cache dir exists
            Self::load_collections_from_directory(&mut store, &local_collections_dir)?;
        }

        Ok(store)
    }
2054
    /// Ensure a collection is loaded into the store
    /// If the collection is a Stub, try to fetch full data from local or remote
    /// and upgrade it to Full. Also builds name_lookup for the collection.
    ///
    /// Idempotent: an existing name_lookup entry for this digest is treated
    /// as proof the collection was already loaded, so repeat calls are cheap.
    fn ensure_collection_loaded(&mut self, collection_digest: &[u8; 32]) -> Result<()> {
        // Check if name_lookup is already populated for this collection
        if self.name_lookup.contains_key(collection_digest) {
            return Ok(());
        }

        // Check if we have a Stub that needs to be loaded
        let needs_fetch = match self.collections.get(collection_digest) {
            Some(SequenceCollectionRecord::Stub(_)) => true,
            Some(SequenceCollectionRecord::Full { .. }) => false,
            None => true, // Not in collections at all, need to fetch
        };

        if needs_fetch {
            // Get the digest string (either from Stub or from the key)
            // NOTE(review): the fallback assumes the [u8; 32] key holds the
            // ASCII digest text (see to_key()) - confirm, since lossy UTF-8
            // on raw hash bytes would produce a bogus path.
            let digest_str = if let Some(SequenceCollectionRecord::Stub(meta)) =
                self.collections.get(collection_digest)
            {
                meta.digest.clone()
            } else {
                String::from_utf8_lossy(collection_digest).to_string()
            };

            let relative_path = format!("collections/{}.rgsi", digest_str);

            // Fetch the collection file
            // Always cache metadata files (they're small), even when persist_to_disk is false
            if !self.quiet {
                // Report "Loading" for a cache hit vs "Downloading" otherwise
                let cached = self
                    .local_path
                    .as_ref()
                    .map(|p| p.join(&relative_path).exists())
                    .unwrap_or(false);
                let verb = if cached { "Loading" } else { "Downloading" };
                eprintln!("{} collection {}...", verb, digest_str);
            }
            let _collection_data =
                Self::fetch_file(&self.local_path, &self.remote_source, &relative_path, true)?;

            // Read the collection from the cached file
            // (fetch_file above has guaranteed the file now exists on disk)
            let local_path = self
                .local_path
                .as_ref()
                .ok_or_else(|| anyhow!("No local path configured"))?;

            let collection_file_path = local_path.join(&relative_path);

            let collection = read_rgsi_file(&collection_file_path)?;

            // Verify the collection digest matches what we requested
            // (guards against a corrupted cache or a misbehaving server)
            let loaded_digest = collection.metadata.digest.to_key();
            if loaded_digest != *collection_digest {
                return Err(anyhow!(
                    "Collection digest mismatch: expected {}, got {}",
                    String::from_utf8_lossy(collection_digest),
                    String::from_utf8_lossy(&loaded_digest)
                ));
            }

            // Convert to SequenceCollectionRecord::Full and replace Stub if present
            let record = SequenceCollectionRecord::from(collection.clone());

            // Add collection to store (replacing Stub if present)
            self.collections.insert(*collection_digest, record);

            // Build name lookup and add sequences to sequence_store as Stubs
            let mut name_map = HashMap::new();
            for sequence_record in &collection.sequences {
                let metadata = sequence_record.metadata();
                let sha512_key = metadata.sha512t24u.to_key();
                name_map.insert(metadata.name.clone(), sha512_key);

                // Add to sequence_store if not already present (as Stub for lazy loading)
                if !self.sequence_store.contains_key(&sha512_key) {
                    self.sequence_store
                        .insert(sha512_key, SequenceRecord::Stub(metadata.clone()));
                    // Also add MD5 lookup
                    let md5_key = metadata.md5.to_key();
                    self.md5_lookup.insert(md5_key, sha512_key);
                }
            }
            self.name_lookup.insert(*collection_digest, name_map);
        } else {
            // Collection is Full but name_lookup not built yet - build it now
            // First, collect the data we need to avoid borrow conflicts
            // (can't read self.collections while mutating self.sequence_store)
            let sequences_data: Vec<(SequenceMetadata, [u8; 32], [u8; 32])> =
                if let Some(SequenceCollectionRecord::Full { sequences, .. }) =
                    self.collections.get(collection_digest)
                {
                    sequences
                        .iter()
                        .map(|seq| {
                            let metadata = seq.metadata().clone();
                            let sha512_key = metadata.sha512t24u.to_key();
                            let md5_key = metadata.md5.to_key();
                            (metadata, sha512_key, md5_key)
                        })
                        .collect()
                } else {
                    Vec::new()
                };

            // Now build name_lookup and add sequences to sequence_store
            let mut name_map = HashMap::new();
            for (metadata, sha512_key, md5_key) in sequences_data {
                name_map.insert(metadata.name.clone(), sha512_key);

                // Add to sequence_store if not already present
                if !self.sequence_store.contains_key(&sha512_key) {
                    self.sequence_store
                        .insert(sha512_key, SequenceRecord::Stub(metadata));
                    self.md5_lookup.insert(md5_key, sha512_key);
                }
            }
            self.name_lookup.insert(*collection_digest, name_map);
        }

        Ok(())
    }
2177
    /// Ensure a sequence is loaded into memory
    /// If the sequence data is not already loaded, fetch it from local or remote
    ///
    /// # Errors
    /// Fails if the digest is unknown to the store, if no
    /// seqdata_path_template is configured, or if the data can neither be
    /// read locally nor fetched from the remote source.
    fn ensure_sequence_loaded(&mut self, digest: &[u8; 32]) -> Result<()> {
        // Check if sequence exists
        let record = self
            .sequence_store
            .get(digest)
            .ok_or_else(|| anyhow!("Sequence not found in store"))?;

        // If data is already loaded, return early
        if matches!(record, SequenceRecord::Full { .. }) {
            return Ok(());
        }

        // Get the necessary information before borrowing mutably
        let digest_str = &record.metadata().sha512t24u;
        let template = self
            .seqdata_path_template
            .as_ref()
            .ok_or_else(|| anyhow!("No sequence data path template configured"))?;

        // Build the relative path using the template
        // (%s2 / %s4 are digest-prefix shard directories; %s is the full digest;
        // prefixed tokens are expanded before the bare %s so they aren't clobbered)
        let relative_path = template
            .replace("%s2", &digest_str[0..2])
            .replace("%s4", &digest_str[0..4])
            .replace("%s", digest_str);

        // Fetch the sequence data
        // Use persist_to_disk flag - this is where memory-only mode saves disk I/O
        if !self.quiet {
            // Report "Loading" for a cache hit vs "Downloading" otherwise
            let cached = self
                .local_path
                .as_ref()
                .map(|p| p.join(&relative_path).exists())
                .unwrap_or(false);
            let verb = if cached { "Loading" } else { "Downloading" };
            eprintln!("{} sequence {}...", verb, digest_str);
        }
        let data = Self::fetch_file(
            &self.local_path,
            &self.remote_source,
            &relative_path,
            self.persist_to_disk,
        )?;

        // Update the record with the loaded data (in-place, no clone needed)
        // entry().and_modify() only runs when the key exists - verified above
        self.sequence_store.entry(*digest).and_modify(|r| {
            r.load_data(data);
        });

        Ok(())
    }
2230
2231    /// Write all sequence metadata to an RGSI file (without collection headers).
2232    ///
2233    /// Creates a global sequence index file containing metadata for all sequences
2234    /// in the store across all collections. Does not include collection-level digest headers.
2235    ///
2236
2237    /// Write the store using its configured paths
2238    ///
2239    /// For disk-backed stores (on_disk), this updates index files only since
2240    /// sequences/collections are already written incrementally.
2241    /// For in-memory stores, this is not supported (use write_store_to_dir instead).
2242    ///
2243    /// # Returns
2244    /// Result indicating success or error
2245    ///
2246    /// # Errors
2247    /// Returns an error if `local_path` is not set.
2248    ///
2249    /// # Example
2250    /// ```ignore
2251    /// let store = RefgetStore::on_disk("/data/store")?;
2252    /// store.add_sequence_collection_from_fasta("genome.fa")?;
2253    /// store.write()?;  // Updates index files
2254    /// ```
2255    pub fn write(&self) -> Result<()> {
2256        if !self.persist_to_disk {
2257            return Err(anyhow!(
2258                "write() only works with disk-backed stores - use write_store_to_dir() instead"
2259            ));
2260        }
2261
2262        // For disk-backed stores, just update indexes (sequences/collections already written)
2263        self.write_index_files()
2264    }
2265
2266    /// Write a RefgetStore object to a directory
2267    pub fn write_store_to_dir<P: AsRef<Path>>(
2268        &self,
2269        root_path: P,
2270        seqdata_path_template: Option<&str>,
2271    ) -> Result<()> {
2272        let root_path = root_path.as_ref();
2273
2274        // Use provided template, or store's template, or default
2275        let template = seqdata_path_template
2276            .or(self.seqdata_path_template.as_deref())
2277            .unwrap_or(DEFAULT_SEQDATA_PATH_TEMPLATE);
2278
2279        println!(
2280            "Writing store to directory: {}; Using seqdata path template: {}",
2281            root_path.display(),
2282            template
2283        );
2284
2285        // Create the root directory if it doesn't exist
2286        fs::create_dir_all(root_path)?;
2287
2288        // Create sequences directory
2289        let sequences_dir = root_path.join("sequences");
2290        fs::create_dir_all(&sequences_dir)?;
2291
2292        // Create collections directory
2293        let collections_dir = root_path.join("collections");
2294        fs::create_dir_all(&collections_dir)?;
2295
2296        // Write each sequence to its own file
2297        for record in self.sequence_store.values() {
2298            match record {
2299                SequenceRecord::Full { metadata, .. } => {
2300                    // Get the path for this sequence using the template and base64url-encoded digest
2301                    let rel_path = Self::get_sequence_path(&metadata.sha512t24u, template);
2302                    let full_path = root_path.join(&rel_path);
2303
2304                    // Write the sequence data to file
2305                    record.to_file(full_path)?;
2306                }
2307                SequenceRecord::Stub(_metadata) => {
2308                    // Stub means sequence already on disk - skip writing
2309                    continue;
2310                }
2311            }
2312        }
2313
2314        // Write each collection to its own .rgsi file
2315        for record in self.collections.values() {
2316            let collection_file_path =
2317                root_path.join(format!("collections/{}.rgsi", record.metadata().digest));
2318            record.write_collection_rgsi(&collection_file_path)?;
2319        }
2320
2321        // Write the sequence metadata index file
2322        let sequence_index_path = root_path.join("sequences.rgsi");
2323        self.write_sequences_rgsi(&sequence_index_path)?;
2324
2325        // Write the collection metadata index file
2326        let collection_index_path = root_path.join("collections.rgci");
2327        self.write_collections_rgci(&collection_index_path)?;
2328
2329        // Create the metadata structure
2330        let metadata = StoreMetadata {
2331            version: 1,
2332            seqdata_path_template: template.to_string(),
2333            collections_path_template: "collections/%s.rgsi".to_string(),
2334            sequence_index: "sequences.rgsi".to_string(),
2335            collection_index: Some("collections.rgci".to_string()),
2336            mode: self.mode,
2337            created_at: Utc::now().to_rfc3339(),
2338        };
2339
2340        // Write metadata to rgstore.json
2341        let json = serde_json::to_string_pretty(&metadata)
2342            .context("Failed to serialize metadata to JSON")?;
2343        fs::write(root_path.join("rgstore.json"), json).context("Failed to write rgstore.json")?;
2344
2345        Ok(())
2346    }
2347
2348    /// Returns statistics about the store
2349    ///
2350    /// # Returns
2351    /// A tuple of (n_sequences, n_collections_loaded, storage_mode_str)
2352    ///
2353    /// Note: n_collections_loaded only reflects collections currently loaded in memory.
2354    /// For remote stores, collections are loaded on-demand when accessed.
2355    pub fn stats(&self) -> (usize, usize, &'static str) {
2356        let n_sequences = self.sequence_store.len();
2357        let n_collections_loaded = self
2358            .collections
2359            .values()
2360            .filter(|record| record.has_sequences())
2361            .count();
2362        let mode_str = match self.mode {
2363            StorageMode::Raw => "Raw",
2364            StorageMode::Encoded => "Encoded",
2365        };
2366        (n_sequences, n_collections_loaded, mode_str)
2367    }
2368
2369    /// Extended statistics including stub/loaded breakdown for collections
2370    pub fn stats_extended(&self) -> StoreStats {
2371        let n_sequences = self.sequence_store.len();
2372        let n_sequences_loaded = self
2373            .sequence_store
2374            .values()
2375            .filter(|record| record.is_loaded())
2376            .count();
2377        let n_collections = self.collections.len();
2378        let n_collections_loaded = self
2379            .collections
2380            .values()
2381            .filter(|record| record.has_sequences())
2382            .count();
2383        let mode_str = match self.mode {
2384            StorageMode::Raw => "Raw",
2385            StorageMode::Encoded => "Encoded",
2386        };
2387        let total_disk_size = self.actual_disk_usage();
2388        StoreStats {
2389            n_sequences,
2390            n_sequences_loaded,
2391            n_collections,
2392            n_collections_loaded,
2393            storage_mode: mode_str.to_string(),
2394            total_disk_size,
2395        }
2396    }
2397}
2398
/// Extended statistics for a RefgetStore
///
/// Produced by [`RefgetStore::stats_extended`]. Distinguishes records that are
/// merely indexed (Stub) from those whose data is resident in memory (Full).
#[derive(Debug, Clone)]
pub struct StoreStats {
    /// Total number of sequences (Stub + Full)
    pub n_sequences: usize,
    /// Number of sequences with data loaded (Full)
    pub n_sequences_loaded: usize,
    /// Total number of collections (Stub + Full)
    pub n_collections: usize,
    /// Number of collections with sequences loaded (Full)
    pub n_collections_loaded: usize,
    /// Storage mode (Raw or Encoded)
    pub storage_mode: String,
    /// Actual disk usage in bytes (all files in store directory)
    pub total_disk_size: usize,
}
2415
/// Format bytes into human-readable size (KB, MB, GB, etc.)
///
/// Values below 1 KiB are printed as an exact integer byte count
/// (e.g. "512 B"); larger values use two decimal places with the
/// largest fitting unit, capped at TB (e.g. "1.50 KB").
fn format_bytes(bytes: usize) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];

    // Small values keep the exact integer representation.
    if bytes < 1024 {
        return format!("{} {}", bytes, UNITS[0]);
    }

    // Scale down by 1024 per unit until the value fits (or units run out).
    let mut size = bytes as f64;
    let mut unit = UNITS[0];
    for &next_unit in &UNITS[1..] {
        size /= 1024.0;
        unit = next_unit;
        if size < 1024.0 {
            break;
        }
    }
    format!("{:.2} {}", size, unit)
}
2433
2434impl Display for RefgetStore {
2435    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
2436        let total_size = self.total_disk_size();
2437        let size_str = format_bytes(total_size);
2438        writeln!(f, "SeqColStore object:")?;
2439        writeln!(f, "  Mode: {:?}", self.mode)?;
2440        writeln!(f, "  Disk size: {} ({} bytes)", size_str, total_size)?;
2441        writeln!(f, ">Sequences (n={}):", self.sequence_store.len())?;
2442        // Print out the sequences in the store
2443        for (i, (sha512_digest, sequence_record)) in self.sequence_store.iter().take(10).enumerate()
2444        {
2445            let metadata = sequence_record.metadata();
2446            let first_8_chars = match sequence_record {
2447                SequenceRecord::Stub(_) => "<stub>".to_string(),
2448                SequenceRecord::Full {
2449                    metadata,
2450                    sequence: seq,
2451                } => {
2452                    // Extract the first 8 characters of the sequence (or fewer if the sequence is shorter)
2453                    match self.mode {
2454                        StorageMode::Encoded => {
2455                            let alphabet = lookup_alphabet(&metadata.alphabet);
2456                            let decoded = decode_substring_from_bytes(
2457                                seq,
2458                                0,
2459                                8.min(metadata.length),
2460                                alphabet,
2461                            );
2462                            String::from_utf8(decoded).unwrap_or_else(|_| "???".to_string())
2463                        }
2464                        StorageMode::Raw => String::from_utf8(seq[0..8.min(seq.len())].to_vec())
2465                            .unwrap_or_else(|_| "???".to_string()),
2466                    }
2467                }
2468            };
2469
2470            writeln!(
2471                f,
2472                "   - {}. {:02x?}, MD5: {:02x?}, Length: {}, Alphabet: {:?}, Start: {}",
2473                i + 1,
2474                std::str::from_utf8(sha512_digest).unwrap(),
2475                &metadata.md5,
2476                &metadata.length,
2477                &metadata.alphabet,
2478                first_8_chars
2479            )?;
2480        }
2481        writeln!(f, ">Collections (n={:?}):", self.name_lookup.len())?;
2482        // Print out the collections in the store
2483        for (i, (digest, name_map)) in self.name_lookup.iter().enumerate() {
2484            // Convert the digest to a hex string
2485            let seqcol_digest_str = String::from_utf8_lossy(digest);
2486            writeln!(
2487                f,
2488                "  {}. Collection Digest: {:02x?} ({} sequences)",
2489                i + 1,
2490                seqcol_digest_str,
2491                name_map.len()
2492            )?;
2493            // Only show first 5 sequences in each collection
2494            for (name, sha512_digest) in name_map.iter().take(5) {
2495                // Convert the sha512_digest to a hex string
2496                let sha512_str = String::from_utf8_lossy(sha512_digest);
2497                writeln!(f, "   - Name: {}, SHA512: {:02x?}", name, sha512_str)?;
2498            }
2499            if name_map.len() > 5 {
2500                writeln!(f, "   - ... and {} more", name_map.len() - 5)?;
2501            }
2502        }
2503
2504        Ok(())
2505    }
2506}
2507
2508#[cfg(test)]
2509mod tests {
2510    use super::*;
2511    // use std::time::Instant;
2512    use crate::collection::{
2513        SequenceCollection, SequenceCollectionMetadata, SequenceMetadata, SequenceRecord,
2514    };
2515    use crate::digest::{md5, sha512t24u};
2516    use tempfile::tempdir;
2517
2518    // Note: FASTA→RGSI roundtrip testing is in fasta.rs::digests_fa_to_rgsi
2519
2520    // Helper function to calculate actual digests for testing
2521    fn calculate_test_digests(sequence: &[u8]) -> (String, String) {
2522        (sha512t24u(sequence), md5(sequence))
2523    }
2524
2525    /// Creates a test store with 3 sequences for export testing
2526    fn setup_export_test_store(temp_path: &std::path::Path) -> (RefgetStore, [u8; 32]) {
2527        let fasta_content = ">chr1\nATGCATGCATGC\n>chr2\nGGGGAAAA\n>chr3\nTTTTCCCC\n";
2528        let temp_fasta_path = temp_path.join("test.fa");
2529        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");
2530
2531        let mut store = RefgetStore::in_memory();
2532        store
2533            .add_sequence_collection_from_fasta(&temp_fasta_path)
2534            .unwrap();
2535
2536        let collections: Vec<_> = store.collections.keys().cloned().collect();
2537        let collection_digest = collections[0];
2538
2539        (store, collection_digest)
2540    }
2541
2542    #[test]
2543    fn test_mode_basics() {
2544        // Test default mode and convenience methods (no sequences needed)
2545        let mut store = RefgetStore::in_memory();
2546
2547        // Default is Encoded
2548        assert_eq!(store.mode, StorageMode::Encoded);
2549
2550        // Convenience methods
2551        store.disable_encoding();
2552        assert_eq!(store.mode, StorageMode::Raw);
2553        store.enable_encoding();
2554        assert_eq!(store.mode, StorageMode::Encoded);
2555
2556        // set_encoding_mode() also works
2557        store.set_encoding_mode(StorageMode::Raw);
2558        assert_eq!(store.mode, StorageMode::Raw);
2559        store.set_encoding_mode(StorageMode::Encoded);
2560        assert_eq!(store.mode, StorageMode::Encoded);
2561    }
2562
    // Verifies that switching storage mode on a populated store re-encodes the
    // stored bytes in place (Raw <-> Encoded) while decoded content is unchanged.
    #[test]
    fn test_mode_switching() {
        let temp_dir = tempdir().expect("Failed to create temporary directory");
        let temp_path = temp_dir.path();
        let fasta_content = ">chr1\nATGCATGCATGC\n>chr2\nGGGGAAAA\n";
        let temp_fasta_path = temp_path.join("test.fa");
        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");

        // Key for chr1 derived from its actual digest, for direct store access.
        let (chr1_sha, _) = calculate_test_digests(b"ATGCATGCATGC");
        let chr1_key = chr1_sha.as_bytes().to_key();

        // Test Raw -> Encoded
        {
            let mut store = RefgetStore::in_memory();
            store.disable_encoding();
            store
                .add_sequence_collection_from_fasta(&temp_fasta_path)
                .unwrap();

            // Verify raw (ASCII)
            if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key)
            {
                assert_eq!(sequence, b"ATGCATGCATGC");
            }
            let seq_before = store.get_sequence(&chr1_sha).unwrap().decode().unwrap();

            // Switch to encoded
            store.set_encoding_mode(StorageMode::Encoded);

            // Verify encoded (smaller)
            if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key)
            {
                assert_eq!(sequence.len(), 3); // 12 bases * 2 bits = 3 bytes
            }
            // Decoded content must be identical regardless of internal encoding.
            let seq_after = store.get_sequence(&chr1_sha).unwrap().decode().unwrap();
            assert_eq!(seq_before, seq_after);
        }

        // Test Encoded -> Raw
        {
            let mut store = RefgetStore::in_memory();
            store
                .add_sequence_collection_from_fasta(&temp_fasta_path)
                .unwrap();

            // Verify encoded
            if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key)
            {
                assert_eq!(sequence.len(), 3);
            }
            let seq_before = store.get_sequence(&chr1_sha).unwrap().decode().unwrap();

            // Switch to raw
            store.disable_encoding();

            // Verify raw (full size)
            if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key)
            {
                assert_eq!(sequence, b"ATGCATGCATGC");
            }
            // Round-trip check: decoding yields the same bytes as before the switch.
            let seq_after = store.get_sequence(&chr1_sha).unwrap().decode().unwrap();
            assert_eq!(seq_before, seq_after);
        }
    }
2627
    // End-to-end: import a FASTA, export region-restricted FASTA from a BED
    // file, and fetch the same regions via the substrings_from_regions iterator.
    #[test]
    fn test_refget_store_retrieve_seq_and_vec() {
        // Create temporary directory for all test files
        let temp_dir = tempdir().expect("Failed to create temporary directory");
        let temp_path = temp_dir.path();

        // --- 1. Prepare Test FASTA Data ---
        let fasta_content = "\
>chr1
ATGCATGCATGC
>chr2
GGGGAAAA
";
        let temp_fasta_path = temp_path.join("test.fa");

        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");

        // --- 2. Initialize RefgetStore and import FASTA ---
        let mut store = RefgetStore::in_memory();
        store
            .add_sequence_collection_from_fasta(&temp_fasta_path)
            .unwrap();

        let sequence_keys: Vec<[u8; 32]> = store.sequence_store.keys().cloned().collect();

        let _ = sequence_keys[0]; //ww1QMyfFm1f4qa3fRLqqJGafIeEuZR1V
        let _ = sequence_keys[1]; //OyXJErGtjgcIVSdobGkHE3sBdQ5faDTf
        // NOTE(review): hard-coded digest of this exact two-sequence collection;
        // must be regenerated if the FASTA fixture above ever changes.
        let collection_digest_ref: &str = "uC_UorBNf3YUu1YIDainBhI94CedlNeH";

        // Calculate expected SHA512t24u and MD5 for test sequences
        let (chr1_sha, chr1_md5) = calculate_test_digests(b"ATGCATGCATGC");
        let (chr2_sha, chr2_md5) = calculate_test_digests(b"GGGGAAAA");
        println!("chr1 values: {}  {}", chr1_sha, chr1_md5);
        println!("chr2 values: {}  {}", chr2_sha, chr2_md5);

        // --- 3. Prepare Test BED Data ---
        // Use only valid entries for the success test
        let bed_content = "\
chr1\t0\t5
chr1\t8\t12
chr2\t0\t4
";
        let temp_bed_path = temp_path.join("test.bed");

        fs::write(&temp_bed_path, bed_content).expect("Failed to write test BED file");

        let temp_output_fa_path = temp_path.join("output.fa");

        store
            .export_fasta_from_regions(
                collection_digest_ref,
                temp_bed_path.to_str().unwrap(),
                temp_output_fa_path.to_str().unwrap(),
            )
            .expect("export_fasta_from_regions failed");

        // Read the output FASTA file and verify its content
        let output_fa_content =
            fs::read_to_string(&temp_output_fa_path).expect("Failed to read output FASTA file");

        // Expected output content (headers and sequences should match the logic of the function).
        // Per-chromosome regions are concatenated under a single header line
        // carrying length, alphabet, and both digests.
        let expected_fa_content = format!(
            ">chr1 12 dna2bit {} {}\nATGCAATGC\n>chr2 8 dna2bit {} {}\nGGGG\n",
            chr1_sha, chr1_md5, chr2_sha, chr2_md5
        );
        assert_eq!(
            output_fa_content.trim(),
            expected_fa_content.trim(),
            "Output FASTA file content mismatch"
        );
        println!("✓ export_fasta_from_regions test passed.");

        // --- Test substrings_from_regions iterator (returns iterator of RetrievedSequence) ---
        let vec_result: Vec<_> = store
            .substrings_from_regions(collection_digest_ref, temp_bed_path.to_str().unwrap())
            .expect("substrings_from_regions failed")
            .collect::<Result<Vec<_>, _>>()
            .expect("substrings_from_regions had errors");

        // Define the expected vector of RetrievedSequence structs
        // (one entry per BED interval, in file order).
        let expected_vec = vec![
            RetrievedSequence {
                sequence: "ATGCA".to_string(),
                chrom_name: "chr1".to_string(),
                start: 0,
                end: 5,
            },
            RetrievedSequence {
                sequence: "ATGC".to_string(),
                chrom_name: "chr1".to_string(),
                start: 8,
                end: 12,
            },
            RetrievedSequence {
                sequence: "GGGG".to_string(),
                chrom_name: "chr2".to_string(),
                start: 0,
                end: 4,
            },
        ];

        // Assert that the returned vector matches the expected vector
        assert_eq!(
            vec_result, expected_vec,
            "Retrieved sequence vector mismatch"
        );
        println!("✓ substrings_from_regions test passed.");
    }
2736
2737    #[test]
2738    fn test_global_refget_store() {
2739        let sequence = b"ACGT";
2740        let name = "test_seq";
2741        println!("Testing RefgetStore with sequence: {}", name);
2742
2743        // Create a sequence collection
2744        let mut collection = SequenceCollection {
2745            metadata: SequenceCollectionMetadata {
2746                digest: "test_collection".to_string(),
2747                n_sequences: 0,
2748                names_digest: "test".to_string(),
2749                sequences_digest: "test".to_string(),
2750                lengths_digest: "test".to_string(),
2751                file_path: None,
2752            },
2753            sequences: Vec::new(),
2754        };
2755
2756        // Create a sequence record
2757        let seq_metadata = SequenceMetadata {
2758            name: name.to_string(),
2759            description: None,
2760            length: sequence.len(),
2761            sha512t24u: sha512t24u(sequence),
2762            md5: md5(sequence),
2763            alphabet: AlphabetType::Dna2bit,
2764            fai: None,
2765        };
2766
2767        let record = SequenceRecord::Full {
2768            metadata: seq_metadata.clone(),
2769            sequence: sequence.to_vec(),
2770        };
2771
2772        collection.sequences.push(record);
2773
2774        // Add the sequence to the store
2775        let mut store = RefgetStore::in_memory();
2776        store.add_sequence_collection(collection.clone()).unwrap();
2777
2778        // Verify the store has the sequence
2779        assert!(!store.sequence_store.is_empty());
2780
2781        // Test sequence lookup by collection+name (using string digest)
2782        let retrieved_by_name_str = store.get_sequence_by_name(&collection.metadata.digest, name);
2783        assert!(retrieved_by_name_str.is_ok());
2784        let retrieved_record = retrieved_by_name_str.unwrap();
2785        assert_eq!(retrieved_record.metadata().name, name);
2786        assert_eq!(retrieved_record.sequence().unwrap(), sequence);
2787
2788        // Test sequence lookup by collection+name (using [u8; 32] digest)
2789        let retrieved_by_name_key =
2790            store.get_sequence_by_name(collection.metadata.digest.to_key(), name);
2791        assert!(retrieved_by_name_key.is_ok());
2792        let retrieved_record = retrieved_by_name_key.unwrap();
2793        assert_eq!(retrieved_record.metadata().name, name);
2794        assert_eq!(retrieved_record.sequence().unwrap(), sequence);
2795
2796        // Test sequence lookup by SHA512 digest (using string)
2797        let retrieved_by_sha512_str = store.get_sequence(&seq_metadata.sha512t24u);
2798        assert!(retrieved_by_sha512_str.is_ok());
2799        let retrieved_record = retrieved_by_sha512_str.unwrap();
2800        assert_eq!(retrieved_record.metadata().name, name);
2801        assert_eq!(retrieved_record.sequence().unwrap(), sequence);
2802
2803        // Test sequence lookup by SHA512 digest (using [u8; 32])
2804        let retrieved_by_sha512_key = store.get_sequence(seq_metadata.sha512t24u.to_key());
2805        assert!(retrieved_by_sha512_key.is_ok());
2806        let retrieved_record = retrieved_by_sha512_key.unwrap();
2807        assert_eq!(retrieved_record.metadata().name, name);
2808        assert_eq!(retrieved_record.sequence().unwrap(), sequence);
2809    }
2810
2811    #[test]
2812    fn test_import_fasta() {
2813        let temp_dir = tempdir().expect("Failed to create temporary directory");
2814        let temp_path = temp_dir.path();
2815
2816        // Copy test FASTA file to temp directory
2817        let test_fa = "../tests/data/fasta/base.fa";
2818        let temp_fa = temp_path.join("base.fa");
2819
2820        std::fs::copy(test_fa, &temp_fa).expect("Failed to copy test FASTA file");
2821
2822        let mut store = RefgetStore::in_memory();
2823
2824        // Import the FASTA file
2825        store.add_sequence_collection_from_fasta(temp_fa).unwrap();
2826
2827        // Check that the store has sequences
2828        assert!(!store.sequence_store.is_empty());
2829
2830        // Try writing to a file
2831        let seq_template = "sequences/%s2/%s.seq";
2832        store
2833            .write_store_to_dir(temp_path.to_str().unwrap(), Some(seq_template))
2834            .unwrap();
2835    }
2836
    // Full persistence round trip: import a gzipped FASTA, write the store to
    // disk, reopen it, and verify metadata equality, lazy (stub) loading, and
    // substring retrieval on the reopened store.
    #[test]
    fn test_disk_persistence() {
        // Create a temporary directory for the test
        let temp_dir = tempdir().unwrap();
        let temp_path = temp_dir.path();
        let temp_fasta = temp_path.join("base.fa.gz");
        std::fs::copy("../tests/data/fasta/base.fa.gz", &temp_fasta)
            .expect("Failed to copy base.fa.gz to tempdir");

        // Create a new sequence store
        let mut store = RefgetStore::in_memory();

        // Import a FASTA file into the store
        // store.add_sequence_collection_from_fasta("../tests/data/subset.fa.gz").unwrap();
        store
            .add_sequence_collection_from_fasta(&temp_fasta)
            .unwrap();

        // Get the sequence keys for verification (assuming we know the test file contains 3 sequences)
        let sequence_keys: Vec<[u8; 32]> = store.sequence_store.keys().cloned().collect();
        assert_eq!(
            sequence_keys.len(),
            3,
            "Test file should contain exactly 3 sequences"
        );

        let sha512_key1 = sequence_keys[0];
        let sha512_key2 = sequence_keys[1];

        // Store original sequences for comparison
        let original_seq1 = store.sequence_store.get(&sha512_key1).unwrap().clone();
        let original_seq2 = store.sequence_store.get(&sha512_key2).unwrap().clone();

        // Write the store to the temporary directory
        let seq_template = "sequences/%s2/%s.seq";
        store
            .write_store_to_dir(temp_path, Some(seq_template))
            .unwrap();

        // Verify that the files were created (using new names)
        assert!(temp_path.join("sequences").exists());
        assert!(temp_path.join("sequences").read_dir().unwrap().count() > 0);
        assert!(temp_path.join("rgstore.json").exists());
        assert!(temp_path.join("sequences.rgsi").exists());
        assert!(temp_path.join("collections.rgci").exists());
        assert!(temp_path.join("collections").exists());

        // Load the store from disk
        let mut loaded_store = RefgetStore::open_local(temp_path).unwrap();

        // Verify that the loaded store has the same sequences
        assert_eq!(loaded_store.sequence_store.len(), 3);

        // Verify that we can retrieve sequences by their keys
        assert!(loaded_store.sequence_store.contains_key(&sha512_key1));
        assert!(loaded_store.sequence_store.contains_key(&sha512_key2));

        // Verify the content of the sequences
        let loaded_seq1 = loaded_store.sequence_store.get(&sha512_key1).unwrap();
        let loaded_seq2 = loaded_store.sequence_store.get(&sha512_key2).unwrap();

        // Check metadata equality (name, length, and both digests survive the round trip)
        assert_eq!(original_seq1.metadata().name, loaded_seq1.metadata().name);
        assert_eq!(
            original_seq1.metadata().length,
            loaded_seq1.metadata().length
        );
        assert_eq!(
            original_seq1.metadata().sha512t24u,
            loaded_seq1.metadata().sha512t24u
        );
        assert_eq!(original_seq1.metadata().md5, loaded_seq1.metadata().md5);

        assert_eq!(original_seq2.metadata().name, loaded_seq2.metadata().name);
        assert_eq!(
            original_seq2.metadata().length,
            loaded_seq2.metadata().length
        );
        assert_eq!(
            original_seq2.metadata().sha512t24u,
            loaded_seq2.metadata().sha512t24u
        );
        assert_eq!(original_seq2.metadata().md5, loaded_seq2.metadata().md5);

        // Check data is not loaded initially (lazy loading)
        assert_eq!(
            loaded_seq1.is_loaded(),
            false,
            "Data should not be loaded initially with lazy loading"
        );
        assert_eq!(
            loaded_seq2.is_loaded(),
            false,
            "Data should not be loaded initially with lazy loading"
        );

        // Verify MD5 lookup is preserved
        assert_eq!(loaded_store.md5_lookup.len(), 3);

        // Verify collections are preserved
        assert_eq!(loaded_store.collections.len(), store.collections.len());

        // Test sequence retrieval functionality
        for (digest, original_record) in &store.sequence_store {
            let loaded_record = loaded_store.get_sequence(*digest).unwrap();
            assert_eq!(
                original_record.metadata().name,
                loaded_record.metadata().name
            );
            assert_eq!(
                original_record.metadata().length,
                loaded_record.metadata().length
            );

            // Test substring retrieval works on loaded store
            // (this forces lazy loading of the sequence data from disk)
            if original_record.metadata().length > 0 {
                let substring_len = std::cmp::min(5, original_record.metadata().length);
                let substring = loaded_store.get_substring(digest, 0, substring_len);
                assert!(
                    substring.is_ok(),
                    "Should be able to retrieve substring from loaded sequence"
                );
            }
        }

        println!("✓ Disk persistence test passed - all data preserved correctly");
    }
2964
2965    #[test]
2966    fn test_export_fasta_all_sequences() {
2967        let temp_dir = tempdir().expect("Failed to create temporary directory");
2968        let (mut store, collection_digest) = setup_export_test_store(temp_dir.path());
2969
2970        let output_path = temp_dir.path().join("exported_all.fa");
2971        store
2972            .export_fasta(&collection_digest, &output_path, None, Some(80))
2973            .unwrap();
2974
2975        let exported = fs::read_to_string(&output_path).unwrap();
2976        assert!(
2977            exported.contains(">chr1") && exported.contains(">chr2") && exported.contains(">chr3")
2978        );
2979        assert!(
2980            exported.contains("ATGCATGCATGC")
2981                && exported.contains("GGGGAAAA")
2982                && exported.contains("TTTTCCCC")
2983        );
2984    }
2985
2986    #[test]
2987    fn test_export_fasta_subset_sequences() {
2988        let temp_dir = tempdir().expect("Failed to create temporary directory");
2989        let (mut store, collection_digest) = setup_export_test_store(temp_dir.path());
2990
2991        let output_path = temp_dir.path().join("exported_subset.fa");
2992        store
2993            .export_fasta(
2994                &collection_digest,
2995                &output_path,
2996                Some(vec!["chr1", "chr3"]),
2997                Some(80),
2998            )
2999            .unwrap();
3000
3001        let exported = fs::read_to_string(&output_path).unwrap();
3002        assert!(exported.contains(">chr1") && exported.contains(">chr3"));
3003        assert!(!exported.contains(">chr2") && !exported.contains("GGGGAAAA"));
3004    }
3005
3006    #[test]
3007    fn test_export_fasta_roundtrip() {
3008        let temp_dir = tempdir().expect("Failed to create temporary directory");
3009        let temp_path = temp_dir.path();
3010
3011        // Create test FASTA with longer sequences
3012        let fasta_content = "\
3013>seq1
3014ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC
3015ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC
3016>seq2
3017GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGGAAAACCCC
3018";
3019        let temp_fasta_path = temp_path.join("original.fa");
3020        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");
3021
3022        // Import into store
3023        let mut store1 = RefgetStore::in_memory();
3024        store1
3025            .add_sequence_collection_from_fasta(&temp_fasta_path)
3026            .unwrap();
3027
3028        // Get original digests
3029        let original_digests: Vec<String> = store1
3030            .sequence_store
3031            .values()
3032            .map(|r| r.metadata().sha512t24u.clone())
3033            .collect();
3034
3035        // Export to new FASTA
3036        let collections: Vec<_> = store1.collections.keys().cloned().collect();
3037        let collection_digest = collections[0];
3038        let exported_path = temp_path.join("exported.fa");
3039        store1
3040            .export_fasta(&collection_digest, &exported_path, None, Some(60))
3041            .expect("Failed to export FASTA");
3042
3043        // Re-import the exported FASTA
3044        let mut store2 = RefgetStore::in_memory();
3045        store2
3046            .add_sequence_collection_from_fasta(&exported_path)
3047            .unwrap();
3048
3049        // Verify digests match (same sequences)
3050        let new_digests: Vec<String> = store2
3051            .sequence_store
3052            .values()
3053            .map(|r| r.metadata().sha512t24u.clone())
3054            .collect();
3055
3056        assert_eq!(
3057            original_digests.len(),
3058            new_digests.len(),
3059            "Should have same number of sequences"
3060        );
3061        for digest in original_digests {
3062            assert!(
3063                new_digests.contains(&digest),
3064                "Digest {} should be present after roundtrip",
3065                digest
3066            );
3067        }
3068
3069        println!("✓ Export/import roundtrip test passed");
3070    }
3071
3072    #[test]
3073    fn test_export_fasta_by_digests() {
3074        let temp_dir = tempdir().expect("Failed to create temporary directory");
3075        let (mut store, _) = setup_export_test_store(temp_dir.path());
3076
3077        let digests: Vec<String> = store
3078            .sequence_store
3079            .values()
3080            .map(|r| r.metadata().sha512t24u.clone())
3081            .collect();
3082        let digest_refs: Vec<&str> = digests.iter().map(|s| s.as_str()).collect();
3083
3084        let output_path = temp_dir.path().join("exported_by_digests.fa");
3085        store
3086            .export_fasta_by_digests(digest_refs, &output_path, Some(80))
3087            .unwrap();
3088
3089        let exported = fs::read_to_string(&output_path).unwrap();
3090        assert!(
3091            exported.contains(">chr1") && exported.contains(">chr2") && exported.contains(">chr3")
3092        );
3093    }
3094
3095    #[test]
3096    fn test_export_fasta_error_handling() {
3097        let temp_dir = tempdir().expect("Failed to create temporary directory");
3098        let (mut store, collection_digest) = setup_export_test_store(temp_dir.path());
3099
3100        let output_path = temp_dir.path().join("should_fail.fa");
3101
3102        // Test with non-existent collection
3103        let fake_collection = b"fake_collection_digest_12345678";
3104        assert!(
3105            store
3106                .export_fasta(fake_collection, &output_path, None, Some(80))
3107                .is_err()
3108        );
3109
3110        // Test with non-existent sequence name
3111        assert!(
3112            store
3113                .export_fasta(
3114                    &collection_digest,
3115                    &output_path,
3116                    Some(vec!["nonexistent_chr"]),
3117                    Some(80)
3118                )
3119                .is_err()
3120        );
3121    }
3122
3123    #[test]
3124    fn test_export_fasta_after_load_local() {
3125        // Test that export_fasta works on disk-loaded stores
3126        // This verifies the lazy loading fix (ensure_collection_loaded is called)
3127        let temp_dir = tempdir().expect("Failed to create temporary directory");
3128        let temp_path = temp_dir.path();
3129        let store_path = temp_path.join("store");
3130
3131        // Create test FASTA
3132        let fasta_content = ">chr1\nACGTACGT\n>chr2\nGGGGAAAA\n";
3133        let fasta_path = temp_path.join("test.fa");
3134        fs::write(&fasta_path, fasta_content).unwrap();
3135
3136        // Create and populate store on disk, save digest before closing
3137        let collection_digest: [u8; 32];
3138        {
3139            let mut store = RefgetStore::on_disk(&store_path).unwrap();
3140            store
3141                .add_sequence_collection_from_fasta(&fasta_path)
3142                .unwrap();
3143            let collections: Vec<_> = store.collections.keys().cloned().collect();
3144            assert_eq!(collections.len(), 1, "Should have exactly one collection");
3145            collection_digest = collections[0];
3146        } // store dropped here
3147
3148        // Load the store fresh from disk
3149        let mut loaded_store = RefgetStore::open_local(&store_path).unwrap();
3150
3151        // Verify collection is initially a stub (lazy loaded)
3152        assert!(
3153            !loaded_store.is_collection_loaded(&collection_digest),
3154            "Collection should be Stub after loading from disk"
3155        );
3156
3157        // This should work (was failing before fix due to missing ensure_collection_loaded call)
3158        let output_path = temp_path.join("exported.fa");
3159        loaded_store
3160            .export_fasta(&collection_digest, &output_path, None, Some(80))
3161            .expect("export_fasta should work on disk-loaded stores");
3162
3163        // Verify output
3164        let exported = fs::read_to_string(&output_path).unwrap();
3165        assert!(exported.contains(">chr1"));
3166        assert!(exported.contains("ACGTACGT"));
3167        assert!(exported.contains(">chr2"));
3168        assert!(exported.contains("GGGGAAAA"));
3169
3170        println!("✓ export_fasta after load_local test passed");
3171    }
3172
3173    #[test]
3174    fn test_sequence_names_with_spaces() {
3175        // Test FASTA header parsing: name is first word, rest is description
3176        // Following FASTA standard, we now split on whitespace
3177        let temp_dir = tempdir().expect("Failed to create temporary directory");
3178        let temp_path = temp_dir.path();
3179
3180        // Create test FASTA with headers containing descriptions after the ID
3181        // This mimics the structure from HPRC pangenome files
3182        let fasta_content = "\
3183>JAHKSE010000016.1 unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000016.1:1:100:1
3184ATGCATGCATGCATGCATGCATGCATGCATGCATGC
3185ATGCATGCATGCATGCATGCATGCATGCATGCATGC
3186>JAHKSE010000012.1 unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000012.1:1:100:1
3187GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGG
3188GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGG
3189";
3190        let temp_fasta_path = temp_path.join("spaces_in_names.fa");
3191        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");
3192
3193        // Import FASTA - headers will be split into name (first word) and description (rest)
3194        let mut store = RefgetStore::in_memory();
3195        store
3196            .add_sequence_collection_from_fasta(&temp_fasta_path)
3197            .expect("Should parse FASTA headers correctly");
3198
3199        // Verify the sequences were loaded
3200        assert_eq!(store.sequence_store.len(), 2);
3201
3202        // Names are now just the first word (before whitespace)
3203        let name1 = "JAHKSE010000016.1";
3204        let name2 = "JAHKSE010000012.1";
3205
3206        // Get the collection
3207        let collections: Vec<_> = store.collections.keys().cloned().collect();
3208        assert_eq!(collections.len(), 1);
3209        let collection_digest = collections[0];
3210
3211        // Verify we can retrieve sequences by their short names (first word only)
3212        // and check the description was captured
3213        {
3214            let seq1 = store.get_sequence_by_name(&collection_digest, name1);
3215            assert!(
3216                seq1.is_ok(),
3217                "Should retrieve sequence by name (first word)"
3218            );
3219
3220            let seq1_meta = seq1.unwrap().metadata();
3221            assert_eq!(seq1_meta.name, "JAHKSE010000016.1");
3222            assert_eq!(
3223                seq1_meta.description,
3224                Some(
3225                    "unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000016.1:1:100:1"
3226                        .to_string()
3227                )
3228            );
3229        }
3230
3231        {
3232            let seq2 = store.get_sequence_by_name(&collection_digest, name2);
3233            assert!(
3234                seq2.is_ok(),
3235                "Should retrieve sequence by name (first word)"
3236            );
3237        }
3238
3239        println!("✓ FASTA header parsing test passed");
3240    }
3241
3242    #[test]
3243    fn test_rgsi_filename_with_dots() {
3244        // Test that RGSI filenames preserve dots in the base name
3245        // Real HPRC files like "HG002.alt.pat.f1_v2.unmasked.fa.gz"
3246        // should create "HG002.alt.pat.f1_v2.unmasked.rgsi", NOT "HG002.rgsi"
3247
3248        let temp_dir = tempdir().expect("Failed to create temporary directory");
3249        let temp_path = temp_dir.path();
3250
3251        // Copy test file to temp (so .rgsi file gets created there, not in test data)
3252        let test_file = "../tests/data/fasta/HG002.alt.pat.f1_v2.unmasked.fa";
3253        let temp_fasta = temp_path.join("HG002.alt.pat.f1_v2.unmasked.fa");
3254        fs::copy(test_file, &temp_fasta).expect("Failed to copy test file");
3255
3256        // Load the FASTA - this creates a .rgsi file
3257        let mut store = RefgetStore::in_memory();
3258        store
3259            .add_sequence_collection_from_fasta(&temp_fasta)
3260            .expect("Should load FASTA");
3261
3262        // Check which .rgsi file was created
3263        let correct_rgsi = temp_path.join("HG002.alt.pat.f1_v2.unmasked.rgsi");
3264        let wrong_rgsi = temp_path.join("HG002.rgsi");
3265
3266        let files: Vec<_> = std::fs::read_dir(temp_path)
3267            .unwrap()
3268            .map(|e| e.unwrap().file_name().to_string_lossy().to_string())
3269            .collect();
3270
3271        assert!(
3272            correct_rgsi.exists(),
3273            "Expected 'HG002.alt.pat.f1_v2.unmasked.rgsi' but found: {:?}",
3274            files
3275        );
3276
3277        assert!(
3278            !wrong_rgsi.exists(),
3279            "Should NOT create 'HG002.rgsi' (strips too many dots)"
3280        );
3281
3282        println!("✓ RGSI filename with dots test passed");
3283    }
3284
3285    #[test]
3286    fn test_on_disk_collection_written_incrementally() {
3287        // Test that collection RGSI files are written to disk immediately
3288        // when using on_disk() store, not just when write_store_to_dir() is called
3289        let temp_dir = tempdir().unwrap();
3290        let temp_path = temp_dir.path();
3291        let temp_fasta = temp_path.join("base.fa.gz");
3292        std::fs::copy("../tests/data/fasta/base.fa.gz", &temp_fasta)
3293            .expect("Failed to copy base.fa.gz to tempdir");
3294
3295        let cache_path = temp_path.join("cache");
3296        let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3297
3298        // Load FASTA file into the store
3299        store
3300            .add_sequence_collection_from_fasta(&temp_fasta)
3301            .unwrap();
3302
3303        // BEFORE calling write_store_to_dir, verify collection RGSI files exist
3304        let collections_dir = cache_path.join("collections");
3305        assert!(
3306            collections_dir.exists(),
3307            "Collections directory should exist"
3308        );
3309
3310        let rgsi_files: Vec<_> = std::fs::read_dir(&collections_dir)
3311            .unwrap()
3312            .map(|e| e.unwrap().file_name().to_string_lossy().to_string())
3313            .collect();
3314
3315        assert!(
3316            !rgsi_files.is_empty(),
3317            "Collection RGSI files should be written incrementally, found: {:?}",
3318            rgsi_files
3319        );
3320        assert!(
3321            rgsi_files.iter().any(|f| f.ends_with(".rgsi")),
3322            "Should have .rgsi files"
3323        );
3324
3325        println!("✓ On-disk collection incremental write test passed");
3326    }
3327
3328    #[test]
3329    fn test_disk_size_calculation() {
3330        let mut store = RefgetStore::in_memory();
3331        store
3332            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3333            .unwrap();
3334
3335        let disk_size = store.total_disk_size();
3336        assert!(disk_size > 0, "Disk size should be greater than 0");
3337
3338        // Verify against manual calculation
3339        let manual: usize = store
3340            .list_sequences()
3341            .iter()
3342            .map(|m| (m.length * m.alphabet.bits_per_symbol()).div_ceil(8))
3343            .sum();
3344        assert_eq!(disk_size, manual);
3345    }
3346
3347    #[test]
3348    fn test_incremental_index_writing() {
3349        let temp_dir = tempdir().unwrap();
3350        let cache_path = temp_dir.path().join("store");
3351        let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3352
3353        store
3354            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3355            .unwrap();
3356
3357        // Index files should exist immediately (using new names)
3358        assert!(
3359            cache_path.join("rgstore.json").exists(),
3360            "rgstore.json should exist"
3361        );
3362        assert!(
3363            cache_path.join("sequences.rgsi").exists(),
3364            "sequences.rgsi should exist"
3365        );
3366        assert!(
3367            cache_path.join("collections.rgci").exists(),
3368            "collections.rgci should exist"
3369        );
3370
3371        // Store should be loadable (mode ignored for existing store)
3372        let _loaded = RefgetStore::on_disk(&cache_path).unwrap();
3373    }
3374
3375    #[test]
3376    fn test_write_method() {
3377        let temp_dir = tempdir().unwrap();
3378        let cache_path = temp_dir.path().join("store");
3379        let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3380
3381        store
3382            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3383            .unwrap();
3384        store.write().unwrap(); // Should succeed
3385
3386        assert!(cache_path.join("rgstore.json").exists());
3387    }
3388
3389    #[test]
3390    fn test_on_disk_smart_constructor() {
3391        let temp_dir = tempdir().unwrap();
3392        let cache_path = temp_dir.path().join("store");
3393
3394        // Create new store (defaults to Encoded mode)
3395        let mut store1 = RefgetStore::on_disk(&cache_path).unwrap();
3396        assert_eq!(store1.mode, StorageMode::Encoded);
3397        store1
3398            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3399            .unwrap();
3400
3401        // Load existing store - should preserve Encoded mode
3402        let store2 = RefgetStore::on_disk(&cache_path).unwrap();
3403        assert_eq!(store2.sequence_store.len(), store1.sequence_store.len());
3404        assert_eq!(
3405            store2.mode,
3406            StorageMode::Encoded,
3407            "Loaded store should preserve Encoded mode"
3408        );
3409
3410        // Test with Raw mode
3411        let cache_path_raw = temp_dir.path().join("store_raw");
3412        let mut store3 = RefgetStore::on_disk(&cache_path_raw).unwrap();
3413        store3.disable_encoding(); // Switch to Raw
3414        assert_eq!(store3.mode, StorageMode::Raw);
3415        store3
3416            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3417            .unwrap();
3418
3419        // Load and verify Raw mode is persisted
3420        let store4 = RefgetStore::on_disk(&cache_path_raw).unwrap();
3421        assert_eq!(
3422            store4.mode,
3423            StorageMode::Raw,
3424            "Loaded store should preserve Raw mode"
3425        );
3426
3427        // Verify rgstore.json contains the mode
3428        let index_path = cache_path_raw.join("rgstore.json");
3429        let json = fs::read_to_string(&index_path).unwrap();
3430        assert!(
3431            json.contains("\"mode\":\"Raw\"") || json.contains("\"mode\": \"Raw\""),
3432            "rgstore.json should contain mode: Raw"
3433        );
3434    }
3435
3436    #[test]
3437    fn test_collection_metadata_methods() {
3438        // Test list_collections, get_collection_metadata, is_collection_loaded
3439        let temp_dir = tempdir().unwrap();
3440        let cache_path = temp_dir.path().join("store");
3441        let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3442
3443        // Add a FASTA file
3444        store
3445            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3446            .unwrap();
3447
3448        // Test list_collections
3449        let collections = store.list_collections();
3450        assert_eq!(collections.len(), 1, "Should have 1 collection");
3451        let digest = collections[0].digest.clone();
3452
3453        // Test get_collection_metadata
3454        let meta = store.get_collection_metadata(&digest);
3455        assert!(meta.is_some(), "Should get collection metadata");
3456        let meta = meta.unwrap();
3457        assert_eq!(meta.n_sequences, 3, "Collection should have 3 sequences");
3458
3459        // Test is_collection_loaded - should be true since we just added it
3460        assert!(
3461            store.is_collection_loaded(&digest),
3462            "Collection should be loaded (Full)"
3463        );
3464
3465        // Test stats_extended returns collection counts
3466        let stats = store.stats_extended();
3467        assert_eq!(stats.n_collections, 1, "Should have 1 collection total");
3468        assert_eq!(
3469            stats.n_collections_loaded, 1,
3470            "Should have 1 collection loaded"
3471        );
3472        assert_eq!(stats.n_sequences, 3, "Should have 3 sequences");
3473
3474        println!("✓ Collection metadata methods test passed");
3475    }
3476
    #[test]
    fn test_collection_stub_lazy_loading() {
        // Lifecycle test for lazy collection loading: a collection persisted
        // to disk reloads as a Stub (metadata only, from collections.rgci) and
        // is upgraded to Full the first time one of its sequences is accessed
        // by name. Assertion order matters — each step inspects store state
        // immediately before/after the lazy-load trigger.
        let temp_dir = tempdir().unwrap();
        let cache_path = temp_dir.path().join("store");

        // Create and populate the on-disk store, remembering the collection digest.
        let mut store = RefgetStore::on_disk(&cache_path).unwrap();
        store
            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
            .unwrap();
        let digest = store.list_collections()[0].digest.clone();

        // Drop the store and reload from disk so the collection becomes a Stub.
        drop(store);
        let mut loaded_store = RefgetStore::open_local(&cache_path).unwrap();

        // VERIFY: Metadata is available even for a Stub (read from collections.rgci).
        let meta = loaded_store.get_collection_metadata(&digest);
        assert!(meta.is_some(), "Metadata should be available for Stub");
        assert_eq!(
            meta.unwrap().n_sequences,
            3,
            "Stub should know sequence count"
        );

        // VERIFY: Collection is a Stub (not loaded into memory).
        assert!(
            !loaded_store.is_collection_loaded(&digest),
            "Collection should be Stub after loading from disk"
        );

        // VERIFY: stats shows 1 collection known but 0 loaded.
        let stats_before = loaded_store.stats_extended();
        assert_eq!(
            stats_before.n_collections, 1,
            "Should have 1 collection total"
        );
        assert_eq!(
            stats_before.n_collections_loaded, 0,
            "Should have 0 collections loaded initially"
        );

        // TRIGGER: Access a sequence by name — this should trigger lazy loading.
        let seq = loaded_store.get_sequence_by_name(&digest, "chr1");
        assert!(
            seq.is_ok(),
            "Should be able to retrieve sequence after lazy load"
        );
        assert_eq!(seq.unwrap().metadata().name, "chr1");

        // VERIFY: Collection has been upgraded to Full (loaded into memory).
        assert!(
            loaded_store.is_collection_loaded(&digest),
            "Collection should be Full after accessing a sequence"
        );

        // VERIFY: stats now reflects 1 collection loaded.
        let stats_after = loaded_store.stats_extended();
        assert_eq!(
            stats_after.n_collections_loaded, 1,
            "Should have 1 collection loaded after access"
        );

        println!("✓ Collection stub lazy loading test passed");
    }
3543
3544    // Note: open_local is tested in test_disk_persistence which is more comprehensive
3545
3546    #[test]
3547    fn test_get_collection() {
3548        // Test the get_collection method (returns collection with sequence metadata, lazy loading)
3549        let temp_dir = tempdir().unwrap();
3550        let cache_path = temp_dir.path().join("store");
3551
3552        // Create and populate the store
3553        let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3554        store
3555            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3556            .unwrap();
3557        let digest = store.list_collections()[0].digest.clone();
3558        drop(store);
3559
3560        // Reload and test get_collection
3561        let mut loaded_store = RefgetStore::open_local(&cache_path).unwrap();
3562
3563        // Before loading - collection is a Stub
3564        assert!(!loaded_store.is_collection_loaded(&digest));
3565
3566        // Before loading - no sequences loaded
3567        let stats_before = loaded_store.stats_extended();
3568        assert_eq!(
3569            stats_before.n_sequences_loaded, 0,
3570            "No sequences should be loaded initially"
3571        );
3572
3573        // Get the collection (loads metadata only, sequences are lazy)
3574        let collection = loaded_store.get_collection(&digest).unwrap();
3575        assert!(
3576            !collection.sequences.is_empty(),
3577            "Collection should have sequences"
3578        );
3579        assert_eq!(collection.sequences.len(), 3);
3580
3581        // After get_collection - collection is loaded but sequences are still stubs (lazy loading)
3582        let stats_after = loaded_store.stats_extended();
3583        assert_eq!(
3584            stats_after.n_sequences_loaded, 0,
3585            "Sequences not loaded until explicitly fetched"
3586        );
3587        assert_eq!(
3588            stats_after.n_collections_loaded, 1,
3589            "Collection should be loaded"
3590        );
3591
3592        // Verify sequences are stubs (not loaded)
3593        for record in loaded_store.sequence_store.values() {
3594            assert!(
3595                !record.is_loaded(),
3596                "Sequences should be stubs after get_collection"
3597            );
3598        }
3599
3600        // Now explicitly load a sequence
3601        let seq_digest = collection.sequences[0].metadata().sha512t24u.clone();
3602        let loaded_seq = loaded_store.get_sequence(&seq_digest).unwrap();
3603        assert!(
3604            loaded_seq.is_loaded(),
3605            "Sequence should be loaded after get_sequence"
3606        );
3607
3608        println!("✓ get_collection test passed");
3609    }
3610
3611    #[test]
3612    fn test_get_sequence() {
3613        // Test the get_sequence method (loads sequence on demand)
3614        let temp_dir = tempdir().unwrap();
3615        let cache_path = temp_dir.path().join("store");
3616
3617        // Create and populate the store
3618        let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3619        store
3620            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3621            .unwrap();
3622
3623        // Get a sequence digest from the sequence_store
3624        let seq_digest = store
3625            .sequence_store
3626            .values()
3627            .next()
3628            .unwrap()
3629            .metadata()
3630            .sha512t24u
3631            .clone();
3632        drop(store);
3633
3634        // Reload and test get_sequence
3635        let mut loaded_store = RefgetStore::open_local(&cache_path).unwrap();
3636
3637        // Before loading - sequence is a Stub (no data)
3638        let seq_before = loaded_store
3639            .sequence_store
3640            .get(&seq_digest.to_key())
3641            .unwrap();
3642        assert!(
3643            !seq_before.is_loaded(),
3644            "Sequence should not have data before get_sequence"
3645        );
3646
3647        // Get the sequence (loads data on demand)
3648        let loaded_seq = loaded_store.get_sequence(&seq_digest).unwrap();
3649        assert!(
3650            loaded_seq.is_loaded(),
3651            "Sequence should have data after get_sequence"
3652        );
3653        assert!(
3654            loaded_seq.sequence().is_some(),
3655            "Sequence data should be available"
3656        );
3657
3658        println!("✓ get_sequence test passed");
3659    }
3660
3661    #[test]
3662    fn test_get_collection_idempotent() {
3663        // Test that calling get_collection twice is safe (idempotent)
3664        let temp_dir = tempdir().unwrap();
3665        let cache_path = temp_dir.path().join("store");
3666
3667        // Create and populate the store
3668        let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3669        store
3670            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3671            .unwrap();
3672        let digest = store.list_collections()[0].digest.clone();
3673        drop(store);
3674
3675        // Reload and test idempotent loading
3676        let mut loaded_store = RefgetStore::open_local(&cache_path).unwrap();
3677
3678        // Get twice - both should succeed
3679        let result1 = loaded_store.get_collection(&digest);
3680        assert!(result1.is_ok(), "First get should succeed");
3681
3682        let result2 = loaded_store.get_collection(&digest);
3683        assert!(result2.is_ok(), "Second get should also succeed");
3684
3685        // Store state should be unchanged after second get
3686        assert_eq!(loaded_store.stats_extended().n_collections_loaded, 1);
3687
3688        println!("✓ get_collection idempotent test passed");
3689    }
3690
3691    #[test]
3692    fn test_sanitize_relative_path_rejects_traversal() {
3693        assert!(RefgetStore::sanitize_relative_path("../etc/passwd").is_err());
3694        assert!(RefgetStore::sanitize_relative_path("foo/../bar").is_err());
3695        assert!(RefgetStore::sanitize_relative_path("foo/../../bar").is_err());
3696        assert!(RefgetStore::sanitize_relative_path("..").is_err());
3697    }
3698
3699    #[test]
3700    fn test_sanitize_relative_path_rejects_absolute() {
3701        assert!(RefgetStore::sanitize_relative_path("/etc/passwd").is_err());
3702        assert!(RefgetStore::sanitize_relative_path("\\windows\\system32").is_err());
3703    }
3704
3705    #[test]
3706    fn test_sanitize_relative_path_accepts_valid() {
3707        assert!(RefgetStore::sanitize_relative_path("sequences/ab/abc123.seq").is_ok());
3708        assert!(RefgetStore::sanitize_relative_path("collections/xyz.rgsi").is_ok());
3709        assert!(RefgetStore::sanitize_relative_path("rgstore.json").is_ok());
3710        assert!(RefgetStore::sanitize_relative_path("sequences/%s2/%s.seq").is_ok());
3711    }
3712
3713    #[test]
3714    fn test_stale_rgsi_cache_is_ignored() {
3715        // Reproduces issue where empty/stale .rgsi cache causes
3716        // "Sequence not found in metadata. Available (0 total): []"
3717        use std::io::Write;
3718
3719        let temp_dir = tempdir().unwrap();
3720
3721        // Create a test FASTA file
3722        let fasta_path = temp_dir.path().join("test.fa");
3723        let mut fasta_file = fs::File::create(&fasta_path).unwrap();
3724        writeln!(fasta_file, ">chr1\nATGCATGC\n>chr2\nGGGGAAAA").unwrap();
3725
3726        // Create an EMPTY .rgsi cache file (simulating stale/corrupt cache)
3727        let rgsi_path = temp_dir.path().join("test.rgsi");
3728        let mut rgsi_file = fs::File::create(&rgsi_path).unwrap();
3729        writeln!(
3730            rgsi_file,
3731            "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
3732        )
3733        .unwrap();
3734
3735        // Create on-disk store
3736        let store_path = temp_dir.path().join("store");
3737        let mut store = RefgetStore::on_disk(&store_path).unwrap();
3738
3739        // Before fix: Failed with "Sequence 'chr1' not found in metadata. Available (0 total): []"
3740        // After fix: Detects empty cache, deletes it, re-digests FASTA
3741        let result = store.add_sequence_collection_from_fasta(&fasta_path);
3742        assert!(
3743            result.is_ok(),
3744            "Should handle stale cache: {:?}",
3745            result.err()
3746        );
3747
3748        // Verify sequences were loaded
3749        assert_eq!(store.sequence_store.len(), 2, "Should have 2 sequences");
3750
3751        println!("✓ Stale RGSI cache test passed");
3752    }
3753}