use std::collections::HashMap;
use std::ffi::OsStr;
use std::fmt::{Display, Formatter};
use std::fs::{self, File, create_dir_all};
use std::io::{BufRead, BufReader, BufWriter, Read, Write};
use std::path::{Path, PathBuf};
use std::str;
use std::time::Instant;

use anyhow::anyhow;
use anyhow::{Context, Result};
use chrono::Utc;
use flate2::Compression;
use flate2::read::GzDecoder;
use flate2::write::GzEncoder;
use gtars_core::utils::{get_dynamic_reader, get_file_info, parse_bedlike_file};
use seq_io::fasta::{Reader, Record};
use serde::{Deserialize, Serialize};

use crate::collection::{
    SequenceCollectionExt, SequenceCollectionRecordExt, SequenceMetadataExt, SequenceRecordExt,
    read_rgsi_file,
};
use crate::digest::{AlphabetType, lookup_alphabet};
use crate::digest::{
    SequenceCollection, SequenceCollectionMetadata, SequenceCollectionRecord, SequenceMetadata,
    SequenceRecord, parse_rgsi_line,
};
use crate::digest::{
    SequenceEncoder, decode_string_from_bytes, decode_substring_from_bytes, encode_sequence,
};
use crate::hashkeyable::HashKeyable;
58
59const DEFAULT_COLLECTION_ID: &str = "DEFAULT_REFGET_SEQUENCE_COLLECTION"; const DEFAULT_SEQDATA_PATH_TEMPLATE: &str = "sequences/%s2/%s.seq"; fn parse_rgci_line(line: &str) -> Option<SequenceCollectionMetadata> {
72 if line.starts_with('#') {
73 return None;
74 }
75 let parts: Vec<&str> = line.split('\t').collect();
76 if parts.len() < 5 {
77 return None;
78 }
79 Some(SequenceCollectionMetadata {
80 digest: parts[0].to_string(),
81 n_sequences: parts[1].parse().ok()?,
82 names_digest: parts[2].to_string(),
83 sequences_digest: parts[3].to_string(),
84 lengths_digest: parts[4].to_string(),
85 file_path: None,
86 })
87}
88
/// How sequence bytes are represented in memory and on disk.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)]
pub enum StorageMode {
    /// Plain sequence bytes, exactly as read from the FASTA input.
    Raw,
    /// Alphabet-packed bytes produced by `encode_sequence`/`SequenceEncoder`.
    Encoded,
}
95
/// One substring extracted from a stored sequence for a BED-like region.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RetrievedSequence {
    /// The extracted, decoded sequence text.
    pub sequence: String,
    /// Name of the sequence (chromosome) the region was resolved against.
    pub chrom_name: String,
    /// Region start, as parsed from the BED-like line.
    pub start: u32,
    /// Region end, as parsed from the BED-like line.
    pub end: u32,
}
103
/// Store of refget sequences and sequence collections, keyed by 32-byte
/// digest keys; optionally backed by a local directory and/or a remote source.
#[derive(Debug)]
pub struct RefgetStore {
    // Primary record store, keyed by sha512t24u digest.
    sequence_store: HashMap<[u8; 32], SequenceRecord>,
    // md5 digest key -> sha512t24u digest key, so either digest resolves.
    md5_lookup: HashMap<[u8; 32], [u8; 32]>,

    // collection digest -> (sequence name -> sha512t24u digest key).
    name_lookup: HashMap<[u8; 32], HashMap<String, [u8; 32]>>,
    // Collection records, keyed by collection digest.
    collections: HashMap<[u8; 32], SequenceCollectionRecord>,
    // Whether sequence bytes are held raw or alphabet-encoded.
    mode: StorageMode,
    // Root directory when the store is disk-backed.
    local_path: Option<PathBuf>,
    // Remote origin, if any (used by code outside this view).
    remote_source: Option<String>,
    // Path template for per-sequence files (`%s2`, `%s` placeholders).
    seqdata_path_template: Option<String>,
    // When true, writes go to disk and full records are demoted to stubs.
    persist_to_disk: bool,
    // Suppresses progress/status printing when true.
    quiet: bool,
}
133
/// Manifest serialized to `rgstore.json`, describing the on-disk layout.
#[derive(Serialize, Deserialize, Debug)]
struct StoreMetadata {
    /// Layout version (this file writes 1).
    version: u32,
    /// Template for sequence data paths (`%s2`, `%s` placeholders).
    seqdata_path_template: String,
    /// Template for per-collection `.rgsi` paths.
    collections_path_template: String,
    /// Relative path of the sequence index file.
    sequence_index: String,
    /// Relative path of the collection index file; may be absent in
    /// manifests written by older versions (hence the serde default).
    #[serde(default)]
    collection_index: Option<String>,
    /// Storage mode the sequence data was written in.
    mode: StorageMode,
    /// RFC 3339 creation timestamp.
    created_at: String,
}
154
/// Streaming iterator over substrings described by a BED-like file,
/// resolved against a single collection in a `RefgetStore`.
pub struct SubstringsFromRegions<'a, K>
where
    K: AsRef<[u8]>,
{
    // Store used to resolve names and fetch substrings (may load from disk).
    store: &'a mut RefgetStore,
    // Buffered reader over the (possibly gzip-decoded) BED-like input.
    reader: BufReader<Box<dyn Read>>,
    // Digest of the collection that region names refer to.
    collection_digest: K,
    // Last chromosome name seen, so repeated regions skip name resolution.
    previous_parsed_chr: String,
    // sha512t24u digest of the sequence for `previous_parsed_chr`.
    current_seq_digest: String,
    // Count of lines read so far; used in error messages.
    line_num: usize,
}
166
167impl<K> Iterator for SubstringsFromRegions<'_, K>
168where
169 K: AsRef<[u8]>,
170{
171 type Item = Result<RetrievedSequence, Box<dyn std::error::Error>>;
172
173 fn next(&mut self) -> Option<Self::Item> {
174 let mut line_string = String::new();
175
176 let num_bytes = self.reader.read_line(&mut line_string);
177 match num_bytes {
178 Ok(bytes) => {
179 if bytes == 0 {
180 return None;
181 }
182 }
183 Err(err) => return Some(Err(err.into())),
184 };
185
186 self.line_num += 1;
187
188 let (parsed_chr, parsed_start, parsed_end) = match parse_bedlike_file(line_string.trim()) {
189 Some(coords) => coords,
190 None => {
191 let err_str = format!(
192 "Error reading line {} because it could not be parsed as a BED-like entry: '{}'",
193 self.line_num + 1,
194 line_string
195 );
196 return Some(Err(err_str.into()));
197 }
198 };
199
200 if parsed_start == -1 || parsed_end == -1 {
201 let err_str = format!(
202 "Error reading line {} due to invalid start or end coordinates: '{}'",
203 self.line_num + 1,
204 line_string
205 );
206 return Some(Err(err_str.into()));
207 }
208
209 if self.previous_parsed_chr != parsed_chr {
210 self.previous_parsed_chr = parsed_chr.clone();
211
212 let result = match self
213 .store
214 .get_sequence_by_name(&self.collection_digest, &parsed_chr)
215 {
216 Ok(seq_record) => seq_record,
217 Err(e) => {
218 let err_str = format!(
219 "Line {}: sequence '{}' not found in collection '{}': {}",
220 self.line_num + 1,
221 parsed_chr,
222 String::from_utf8_lossy(self.collection_digest.as_ref()),
223 e
224 );
225 return Some(Err(err_str.into()));
226 }
227 };
228
229 self.current_seq_digest = result.metadata().sha512t24u.clone();
230 }
231
232 let retrieved_substring = match self.store.get_substring(
233 &self.current_seq_digest,
234 parsed_start as usize,
235 parsed_end as usize,
236 ) {
237 Ok(substring) => substring,
238 Err(e) => {
239 let err_str = format!(
240 "Line {}: failed to get substring for digest '{}' from {} to {}: {}",
241 self.line_num + 1,
242 self.current_seq_digest,
243 parsed_start,
244 parsed_end,
245 e
246 );
247 return Some(Err(err_str.into()));
248 }
249 };
250
251 Some(Ok(RetrievedSequence {
252 sequence: retrieved_substring,
253 chrom_name: parsed_chr,
254 start: parsed_start as u32, end: parsed_end as u32, }))
257 }
258}
259
260impl RefgetStore {
261 fn new(mode: StorageMode) -> Self {
264 let mut name_lookup = HashMap::new();
266 name_lookup.insert(DEFAULT_COLLECTION_ID.to_key(), HashMap::new());
267
268 RefgetStore {
269 sequence_store: HashMap::new(),
270 md5_lookup: HashMap::new(),
271 name_lookup,
272 collections: HashMap::new(),
273 mode,
274 local_path: None,
275 remote_source: None,
276 seqdata_path_template: None,
277 persist_to_disk: false, quiet: false,
279 }
280 }
281
    /// Toggle progress/status printing for this store.
    pub fn set_quiet(&mut self, quiet: bool) {
        self.quiet = quiet;
    }
292
    /// Whether progress/status printing is currently suppressed.
    pub fn is_quiet(&self) -> bool {
        self.quiet
    }
297
298 pub fn on_disk<P: AsRef<Path>>(cache_path: P) -> Result<Self> {
316 let cache_path = cache_path.as_ref();
317 let index_path = cache_path.join("rgstore.json");
318
319 if index_path.exists() {
320 Self::open_local(cache_path)
322 } else {
323 let mode = StorageMode::Encoded;
325 create_dir_all(cache_path)?;
326
327 let mut store = Self::new(mode);
329 store.local_path = Some(cache_path.to_path_buf());
330 store.seqdata_path_template = Some(DEFAULT_SEQDATA_PATH_TEMPLATE.to_string());
331 store.persist_to_disk = true; create_dir_all(cache_path.join("sequences"))?;
335 create_dir_all(cache_path.join("collections"))?;
336
337 Ok(store)
338 }
339 }
340
    /// Create a purely in-memory store using encoded (packed) storage.
    pub fn in_memory() -> Self {
        Self::new(StorageMode::Encoded)
    }
355
356 pub fn set_encoding_mode(&mut self, new_mode: StorageMode) {
370 if self.mode == new_mode {
371 return; }
373
374 for record in self.sequence_store.values_mut() {
376 match record {
377 SequenceRecord::Full { metadata, sequence } => {
378 match (self.mode, new_mode) {
379 (StorageMode::Raw, StorageMode::Encoded) => {
380 let alphabet = lookup_alphabet(&metadata.alphabet);
382 *sequence = encode_sequence(&*sequence, alphabet);
383 }
384 (StorageMode::Encoded, StorageMode::Raw) => {
385 let alphabet = lookup_alphabet(&metadata.alphabet);
387 *sequence =
388 decode_string_from_bytes(&*sequence, metadata.length, alphabet);
389 }
390 _ => {} }
392 }
393 SequenceRecord::Stub(_) => {
394 }
396 }
397 }
398
399 self.mode = new_mode;
400 }
401
    /// Switch to packed (`Encoded`) storage, re-encoding loaded data.
    pub fn enable_encoding(&mut self) {
        self.set_encoding_mode(StorageMode::Encoded);
    }
407
    /// Switch to plain (`Raw`) storage, decoding loaded data.
    pub fn disable_encoding(&mut self) {
        self.set_encoding_mode(StorageMode::Raw);
    }
413
    /// Turn on disk persistence rooted at `path`.
    ///
    /// Creates the on-disk layout, flushes every in-memory `Full` sequence
    /// record to its own file (demoting it to a metadata-only `Stub` to free
    /// memory), writes each collection's file, then rewrites the index files.
    ///
    /// # Errors
    /// Fails if any directory or file cannot be created or written.
    pub fn enable_persistence<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
        let path = path.as_ref();

        self.local_path = Some(path.to_path_buf());
        self.persist_to_disk = true;
        // Keep a previously-configured template; otherwise use the default.
        self.seqdata_path_template
            .get_or_insert_with(|| DEFAULT_SEQDATA_PATH_TEMPLATE.to_string());

        create_dir_all(path.join("sequences"))?;
        create_dir_all(path.join("collections"))?;

        // Snapshot the keys first: the loop mutates `sequence_store`.
        let keys: Vec<[u8; 32]> = self.sequence_store.keys().cloned().collect();
        for key in keys {
            if let Some(SequenceRecord::Full { metadata, sequence }) = self.sequence_store.get(&key)
            {
                self.write_sequence_to_disk_single(metadata, sequence)?;
                // Replace the in-memory record with a metadata-only stub.
                let stub = SequenceRecord::Stub(metadata.clone());
                self.sequence_store.insert(key, stub);
            }
        }

        for record in self.collections.values() {
            self.write_collection_to_disk_single(record)?;
        }

        self.write_index_files()?;

        Ok(())
    }
460
    /// Stop writing future changes to disk (already-written files remain).
    pub fn disable_persistence(&mut self) {
        self.persist_to_disk = false;
    }
468
    /// Whether changes are currently being persisted to disk.
    pub fn is_persisting(&self) -> bool {
        self.persist_to_disk
    }
473
474 pub fn add_sequence<T: Into<Option<[u8; 32]>>>(
485 &mut self,
486 sequence_record: SequenceRecord,
487 collection_digest: T,
488 force: bool,
489 ) -> Result<()> {
490 let collection_digest = collection_digest
492 .into()
493 .unwrap_or(DEFAULT_COLLECTION_ID.to_key());
494 self.collections.get(&collection_digest).ok_or_else(|| {
495 anyhow::anyhow!("Collection not found for digest: {:?}", collection_digest)
496 })?;
497
498 let metadata = sequence_record.metadata();
500
501 self.name_lookup
503 .entry(collection_digest)
504 .or_default()
505 .insert(metadata.name.clone(), metadata.sha512t24u.to_key());
506
507 self.add_sequence_record(sequence_record, force)?;
509
510 Ok(())
511 }
512
    /// Add a collection; a collection with the same digest is left untouched.
    pub fn add_sequence_collection(&mut self, collection: SequenceCollection) -> Result<()> {
        self.add_sequence_collection_internal(collection, false)
    }
523
    /// Add a collection, overwriting any existing one with the same digest.
    pub fn add_sequence_collection_force(&mut self, collection: SequenceCollection) -> Result<()> {
        self.add_sequence_collection_internal(collection, true)
    }
534
    /// Shared implementation for adding a collection.
    ///
    /// When `force` is false an existing collection with the same digest is
    /// left untouched. Otherwise the collection record is written (to disk
    /// first, if persisting), every contained sequence is registered, and the
    /// index files are refreshed when persistence is active.
    fn add_sequence_collection_internal(
        &mut self,
        collection: SequenceCollection,
        force: bool,
    ) -> Result<()> {
        let coll_digest = collection.metadata.digest.to_key();

        if !force && self.collections.contains_key(&coll_digest) {
            // Idempotent: re-adding an existing collection is a no-op.
            return Ok(());
        }

        // NOTE(review): this clones the whole collection, sequences included,
        // because `collection.sequences` is still consumed below — worth
        // revisiting for very large collections.
        let record = SequenceCollectionRecord::from(collection.clone());

        if self.persist_to_disk && self.local_path.is_some() {
            self.write_collection_to_disk_single(&record)?;
        }

        self.collections.insert(coll_digest, record);

        for sequence_record in collection.sequences {
            self.add_sequence(sequence_record, coll_digest, force)?;
        }

        if self.persist_to_disk && self.local_path.is_some() {
            self.write_index_files()?;
        }

        Ok(())
    }
572
573 fn add_sequence_record(&mut self, sr: SequenceRecord, force: bool) -> Result<()> {
577 let metadata = sr.metadata();
578 let key = metadata.sha512t24u.to_key();
579
580 if !force && self.sequence_store.contains_key(&key) {
582 return Ok(());
584 }
585
586 self.md5_lookup
587 .insert(metadata.md5.to_key(), metadata.sha512t24u.to_key());
588
589 if self.persist_to_disk && self.local_path.is_some() {
591 match &sr {
592 SequenceRecord::Full { metadata, sequence } => {
593 self.write_sequence_to_disk_single(metadata, sequence)?;
595 let stub = SequenceRecord::Stub(metadata.clone());
597 self.sequence_store.insert(key, stub);
598 return Ok(());
599 }
600 SequenceRecord::Stub(_) => {
601 }
603 }
604 }
605
606 self.sequence_store.insert(key, sr);
608 Ok(())
609 }
610
    /// Add a FASTA file as a collection; an existing collection with the same
    /// digest is skipped. Returns the metadata and whether it was added.
    pub fn add_sequence_collection_from_fasta<P: AsRef<Path>>(
        &mut self,
        file_path: P,
    ) -> Result<(SequenceCollectionMetadata, bool)> {
        self.add_sequence_collection_from_fasta_internal(file_path, false)
    }
633
    /// Add a FASTA file as a collection, overwriting any existing collection
    /// with the same digest. Returns the metadata and whether it was added.
    pub fn add_sequence_collection_from_fasta_force<P: AsRef<Path>>(
        &mut self,
        file_path: P,
    ) -> Result<(SequenceCollectionMetadata, bool)> {
        self.add_sequence_collection_from_fasta_internal(file_path, true)
    }
651
652 fn add_sequence_collection_from_fasta_internal<P: AsRef<Path>>(
655 &mut self,
656 file_path: P,
657 force: bool,
658 ) -> Result<(SequenceCollectionMetadata, bool)> {
659 if !self.quiet {
661 println!("Processing {}...", file_path.as_ref().display());
662 }
663
664 let digest_start = Instant::now();
666 let seqcol = SequenceCollection::from_fasta(&file_path)?;
667 let digest_elapsed = digest_start.elapsed();
668
669 let metadata = seqcol.metadata.clone();
671
672 if !force
674 && self
675 .collections
676 .contains_key(&seqcol.metadata.digest.to_key())
677 {
678 if !self.quiet {
679 println!("Skipped {} (already exists)", seqcol.metadata.digest);
680 }
681 return Ok((metadata, false));
682 }
683
684 self.add_sequence_collection_internal(seqcol.clone(), force)?;
686
687 let mut seqmeta_hashmap: HashMap<String, SequenceMetadata> = HashMap::new();
689 let seqcol_sequences = seqcol.sequences.clone(); for record in seqcol_sequences {
691 let seqmeta = record.metadata().clone();
692 seqmeta_hashmap.insert(seqmeta.name.clone(), seqmeta);
693 }
694
695 let file_reader = get_dynamic_reader(file_path.as_ref())?;
696 let mut fasta_reader = Reader::new(file_reader);
697
698 let encode_start = Instant::now();
700
701 let mut seq_count = 0;
702 while let Some(record) = fasta_reader.next() {
703 let record = record?;
704 let header = std::str::from_utf8(record.head())?;
705 let (name, _description) = crate::fasta::parse_fasta_header(header);
707 let dr = seqmeta_hashmap
708 .get(&name)
709 .ok_or_else(|| {
710 let available_keys: Vec<_> = seqmeta_hashmap.keys().collect();
711 let total = available_keys.len();
712 let sample: Vec<_> = available_keys.iter().take(3).collect();
713 anyhow::anyhow!(
714 "Sequence '{}' not found in metadata. Available ({} total): {:?}{}",
715 name,
716 total,
717 sample,
718 if total > 3 { " ..." } else { "" }
719 )
720 })?
721 .clone();
722
723 seq_count += 1;
724
725 match self.mode {
726 StorageMode::Raw => {
727 let mut raw_sequence = Vec::with_capacity(dr.length);
728 for seq_line in record.seq_lines() {
730 raw_sequence.extend(seq_line);
731 }
732
733 self.add_sequence(
735 SequenceRecord::Full {
736 metadata: dr,
737 sequence: raw_sequence,
738 },
739 seqcol.metadata.digest.to_key(),
740 true, )?;
742 }
743 StorageMode::Encoded => {
744 let mut encoder = SequenceEncoder::new(dr.alphabet, dr.length);
746 for seq_line in record.seq_lines() {
747 encoder.update(seq_line);
748 }
749 let encoded_sequence = encoder.finalize();
750
751 self.add_sequence(
753 SequenceRecord::Full {
754 metadata: dr,
755 sequence: encoded_sequence,
756 },
757 seqcol.metadata.digest.to_key(),
758 true, )?;
760 }
761 }
762 }
763
764 let encode_elapsed = encode_start.elapsed();
765
766 if !self.quiet {
768 println!(
769 "Added {} ({} seqs) in {:.1}s [{:.1}s digest + {:.1}s encode]",
770 seqcol.metadata.digest,
771 seq_count,
772 digest_elapsed.as_secs_f64() + encode_elapsed.as_secs_f64(),
773 digest_elapsed.as_secs_f64(),
774 encode_elapsed.as_secs_f64()
775 );
776 }
777
778 Ok((metadata, true))
782 }
783
784 pub fn sequence_digests(&self) -> impl Iterator<Item = [u8; 32]> + '_ {
786 self.sequence_store.keys().cloned()
787 }
788
789 pub fn sequence_metadata(&self) -> impl Iterator<Item = &SequenceMetadata> + '_ {
804 self.sequence_store.values().map(|rec| rec.metadata())
805 }
806
807 pub fn total_disk_size(&self) -> usize {
829 self.sequence_store
830 .values()
831 .map(|rec| rec.metadata().disk_size(&self.mode))
832 .sum()
833 }
834
835 pub fn actual_disk_usage(&self) -> usize {
843 let Some(path) = &self.local_path else {
844 return 0;
845 };
846
847 fn dir_size(path: &std::path::Path) -> usize {
848 let mut total = 0;
849 if let Ok(entries) = std::fs::read_dir(path) {
850 for entry in entries.flatten() {
851 let path = entry.path();
852 if path.is_file() {
853 total += entry.metadata().map(|m| m.len() as usize).unwrap_or(0);
854 } else if path.is_dir() {
855 total += dir_size(&path);
856 }
857 }
858 }
859 total
860 }
861
862 dir_size(path)
863 }
864
865 pub fn list_collections(&self) -> Vec<SequenceCollectionMetadata> {
881 let mut result: Vec<_> = self
882 .collections
883 .values()
884 .map(|record| record.metadata().clone())
885 .collect();
886 result.sort_by(|a, b| a.digest.cmp(&b.digest));
887 result
888 }
889
890 pub fn get_collection_metadata<K: AsRef<[u8]>>(
894 &self,
895 collection_digest: K,
896 ) -> Option<&SequenceCollectionMetadata> {
897 let key = collection_digest.to_key();
898 self.collections.get(&key).map(|record| record.metadata())
899 }
900
901 pub fn get_collection(&mut self, collection_digest: &str) -> Result<SequenceCollection> {
914 let key = collection_digest.to_key();
915 self.ensure_collection_loaded(&key)?;
916
917 let seq_digests: Vec<[u8; 32]> = self
919 .name_lookup
920 .get(&key)
921 .map(|name_map| name_map.values().cloned().collect())
922 .unwrap_or_default();
923
924 let metadata = self
930 .collections
931 .get(&key)
932 .ok_or_else(|| anyhow!("Collection not found: {}", collection_digest))?
933 .metadata()
934 .clone();
935
936 let sequences: Vec<SequenceRecord> = seq_digests
938 .iter()
939 .filter_map(|seq_key| self.sequence_store.get(seq_key).cloned())
940 .collect();
941
942 Ok(SequenceCollection {
943 metadata,
944 sequences,
945 })
946 }
947
948 pub fn list_sequences(&self) -> Vec<SequenceMetadata> {
964 let mut result: Vec<_> = self
965 .sequence_store
966 .values()
967 .map(|rec| rec.metadata().clone())
968 .collect();
969 result.sort_by(|a, b| a.sha512t24u.cmp(&b.sha512t24u));
970 result
971 }
972
973 pub fn get_sequence_metadata<K: AsRef<[u8]>>(
977 &self,
978 seq_digest: K,
979 ) -> Option<&SequenceMetadata> {
980 let key = seq_digest.to_key();
981 self.sequence_store.get(&key).map(|rec| rec.metadata())
982 }
983
984 pub fn get_sequence<K: AsRef<[u8]>>(&mut self, seq_digest: K) -> Result<&SequenceRecord> {
992 let digest_key = seq_digest.to_key();
993 let actual_key = self
995 .md5_lookup
996 .get(&digest_key)
997 .copied()
998 .unwrap_or(digest_key);
999 self.ensure_sequence_loaded(&actual_key)?;
1000 self.sequence_store.get(&actual_key).ok_or_else(|| {
1001 anyhow!(
1002 "Sequence not found: {}",
1003 String::from_utf8_lossy(seq_digest.as_ref())
1004 )
1005 })
1006 }
1007
1008 pub fn get_sequence_by_name<K: AsRef<[u8]>>(
1016 &mut self,
1017 collection_digest: K,
1018 sequence_name: &str,
1019 ) -> Result<&SequenceRecord> {
1020 let collection_key = collection_digest.to_key();
1021 self.ensure_collection_loaded(&collection_key)?;
1022
1023 let digest_key = if let Some(name_map) = self.name_lookup.get(&collection_key) {
1024 name_map
1025 .get(sequence_name)
1026 .cloned()
1027 .ok_or_else(|| anyhow!("Sequence '{}' not found in collection", sequence_name))?
1028 } else {
1029 return Err(anyhow!(
1030 "Collection not found: {}",
1031 String::from_utf8_lossy(collection_digest.as_ref())
1032 ));
1033 };
1034
1035 self.ensure_sequence_loaded(&digest_key)?;
1036 self.sequence_store.get(&digest_key).ok_or_else(|| {
1037 anyhow!(
1038 "Sequence record not found for '{}' after loading",
1039 sequence_name
1040 )
1041 })
1042 }
1043
1044 pub fn iter_collections(&mut self) -> impl Iterator<Item = SequenceCollection> {
1058 let mut digests: Vec<String> = self
1060 .collections
1061 .values()
1062 .map(|rec| rec.metadata().digest.clone())
1063 .collect();
1064 digests.sort();
1065
1066 let mut collections = Vec::new();
1068 for digest in digests {
1069 if let Ok(collection) = self.get_collection(&digest) {
1070 collections.push(collection);
1071 }
1072 }
1073 collections.into_iter()
1074 }
1075
1076 pub fn iter_sequences(&mut self) -> impl Iterator<Item = SequenceRecord> {
1090 let keys: Vec<[u8; 32]> = self.sequence_store.keys().cloned().collect();
1092
1093 for key in &keys {
1095 let _ = self.ensure_sequence_loaded(key);
1096 }
1097
1098 let mut records: Vec<_> = self.sequence_store.values().cloned().collect();
1100 records.sort_by(|a, b| a.metadata().sha512t24u.cmp(&b.metadata().sha512t24u));
1101 records.into_iter()
1102 }
1103
1104 pub fn is_collection_loaded<K: AsRef<[u8]>>(&self, collection_digest: K) -> bool {
1106 let key = collection_digest.to_key();
1107 self.collections
1108 .get(&key)
1109 .map_or(false, |record| record.has_sequences())
1110 }
1111
    /// Root directory of the disk-backed store, if one is configured.
    pub fn local_path(&self) -> Option<&PathBuf> {
        self.local_path.as_ref()
    }
1116
    /// Remote origin of this store, if one is configured.
    pub fn remote_source(&self) -> Option<&str> {
        self.remote_source.as_deref()
    }
1121
    /// The store's current storage mode (raw or encoded).
    pub fn storage_mode(&self) -> StorageMode {
        self.mode
    }
1126
1127 pub fn substrings_from_regions<'a, K: AsRef<[u8]>>(
1148 &'a mut self,
1149 collection_digest: K,
1150 bed_file_path: &str,
1151 ) -> Result<SubstringsFromRegions<'a, K>, Box<dyn std::error::Error>> {
1152 let path = Path::new(bed_file_path);
1153 let file_info = get_file_info(path);
1154 let is_gzipped = file_info.is_gzipped;
1155
1156 let opened_bed_file = File::open(path)?;
1157
1158 let reader: Box<dyn Read> = match is_gzipped {
1159 true => Box::new(GzDecoder::new(BufReader::new(opened_bed_file))),
1160 false => Box::new(opened_bed_file),
1161 };
1162 let reader = BufReader::new(reader);
1163
1164 Ok(SubstringsFromRegions {
1165 store: self,
1166 reader,
1167 collection_digest,
1168 previous_parsed_chr: String::new(),
1169 current_seq_digest: String::new(),
1170 line_num: 0,
1171 })
1172 }
1173
    /// Stream regions from a BED-like file and write the corresponding
    /// substrings to a FASTA file (gzip-compressed when the output path ends
    /// in `.gz`).
    ///
    /// Headers have the form `>name length alphabet sha512t24u md5` and are
    /// emitted once per run of consecutive regions on the same sequence;
    /// regions under the same header are concatenated on one block.
    ///
    /// # Errors
    /// Fails on I/O errors, unknown collections, or unparseable BED lines.
    pub fn export_fasta_from_regions<K: AsRef<[u8]>>(
        &mut self,
        collection_digest: K,
        bed_file_path: &str,
        output_file_path: &str,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let output_path_obj = Path::new(output_file_path);
        if let Some(parent) = output_path_obj.parent() {
            create_dir_all(parent)?;
        }

        let file = File::create(output_file_path)?;

        // Compress transparently when the target ends in `.gz`.
        let mut writer: Box<dyn Write> = if output_path_obj.extension() == Some(OsStr::new("gz")) {
            Box::new(GzEncoder::new(file, Compression::default()))
        } else {
            Box::new(file)
        };

        let collection_key = collection_digest.as_ref().to_key();

        self.ensure_collection_loaded(&collection_key)?;

        // Snapshot per-name header fields up front so the iterator below can
        // hold the only (mutable) borrow of `self`.
        let name_to_metadata: HashMap<String, (String, usize, AlphabetType, String, String)> = self
            .name_lookup
            .get(&collection_key)
            .map(|name_map| {
                name_map
                    .iter()
                    .filter_map(|(name, seq_digest)| {
                        self.sequence_store.get(seq_digest).map(|record| {
                            let metadata = record.metadata();
                            (
                                name.clone(),
                                (
                                    metadata.name.clone(),
                                    metadata.length,
                                    metadata.alphabet,
                                    metadata.sha512t24u.clone(),
                                    metadata.md5.clone(),
                                ),
                            )
                        })
                    })
                    .collect()
            })
            .unwrap_or_default();

        let seq_iter = self.substrings_from_regions(&collection_digest, bed_file_path)?;

        let mut previous_parsed_chr = String::new();
        let mut current_header: String = String::new();
        let mut previous_header: String = String::new();

        for rs in seq_iter.into_iter() {
            let rs = rs?;

            if previous_parsed_chr != rs.chrom_name {
                previous_parsed_chr = rs.chrom_name.clone();

                // NOTE(review): if the chromosome is missing from the snapshot
                // the previous header is silently reused — confirm intended.
                if let Some((name, length, alphabet, sha512, md5)) =
                    name_to_metadata.get(&rs.chrom_name)
                {
                    current_header =
                        format!(">{} {} {} {} {}", name, length, alphabet, sha512, md5);
                }
            }

            let retrieved_substring = rs.sequence;

            // Emit the header only when it changes.
            if previous_header != current_header {
                let prefix = if previous_header.is_empty() { "" } else { "\n" };

                previous_header = current_header.clone();

                let header_to_be_written = format!("{}{}\n", prefix, current_header);
                writer.write_all(header_to_be_written.as_bytes())?;
            }

            writer.write_all(retrieved_substring.as_ref())?;
        }

        writer.flush()?;

        Ok(())
    }
1289
1290 pub fn get_substring<K: AsRef<[u8]>>(
1302 &mut self,
1303 sha512_digest: K,
1304 start: usize,
1305 end: usize,
1306 ) -> Result<String> {
1307 let digest_key = sha512_digest.to_key();
1308
1309 self.ensure_sequence_loaded(&digest_key)?;
1311
1312 let record = self.sequence_store.get(&digest_key).ok_or_else(|| {
1313 anyhow!(
1314 "Sequence not found: {}",
1315 String::from_utf8_lossy(sha512_digest.as_ref())
1316 )
1317 })?;
1318 let (metadata, sequence) = match record {
1319 SequenceRecord::Stub(_) => return Err(anyhow!("Sequence data not loaded (stub only)")),
1320 SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
1321 };
1322
1323 if start >= metadata.length || end > metadata.length || start >= end {
1324 return Err(anyhow!(
1325 "Invalid substring range: start={}, end={}, sequence length={}",
1326 start,
1327 end,
1328 metadata.length
1329 ));
1330 }
1331
1332 match self.mode {
1333 StorageMode::Encoded => {
1334 let alphabet = lookup_alphabet(&metadata.alphabet);
1335 let decoded_sequence = decode_substring_from_bytes(sequence, start, end, alphabet);
1336 String::from_utf8(decoded_sequence)
1337 .map_err(|e| anyhow!("Failed to decode UTF-8 sequence: {}", e))
1338 }
1339 StorageMode::Raw => {
1340 let raw_slice: &[u8] = &sequence[start..end];
1341 String::from_utf8(raw_slice.to_vec())
1342 .map_err(|e| anyhow!("Failed to decode UTF-8 sequence: {}", e))
1343 }
1344 }
1345 }
1346
    /// Export sequences from one collection to a FASTA file.
    ///
    /// `sequence_names` restricts the export to those names (all sequences
    /// when `None`); `line_width` wraps sequence lines (default 80). Output
    /// is gzip-compressed when the path ends in `.gz`.
    ///
    /// # Errors
    /// Fails when the collection or a requested name is missing, sequence
    /// data is not loaded, or the file cannot be created/written.
    pub fn export_fasta<K: AsRef<[u8]>, P: AsRef<Path>>(
        &mut self,
        collection_digest: K,
        output_path: P,
        sequence_names: Option<Vec<&str>>,
        line_width: Option<usize>,
    ) -> Result<()> {
        let line_width = line_width.unwrap_or(80);
        let output_path = output_path.as_ref();
        let collection_key = collection_digest.as_ref().to_key();

        self.ensure_collection_loaded(&collection_key)?;

        // Clone the name->digest map so `self` can be borrowed mutably below.
        let name_to_digest: HashMap<String, [u8; 32]> = self
            .name_lookup
            .get(&collection_key)
            .ok_or_else(|| {
                anyhow!(
                    "Collection not found: {:?}",
                    String::from_utf8_lossy(collection_digest.as_ref())
                )
            })?
            .clone();

        // NOTE(review): with `sequence_names == None` the export order follows
        // HashMap iteration order, i.e. it is not deterministic — confirm ok.
        let names_to_export: Vec<String> = if let Some(names) = sequence_names {
            names.iter().map(|s| s.to_string()).collect()
        } else {
            name_to_digest.keys().cloned().collect()
        };

        let file = File::create(output_path).context(format!(
            "Failed to create output file: {}",
            output_path.display()
        ))?;

        // Compress transparently when the target ends in `.gz`.
        let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
            Box::new(GzEncoder::new(file, Compression::default()))
        } else {
            Box::new(file)
        };

        for seq_name in names_to_export {
            let seq_digest = name_to_digest
                .get(&seq_name)
                .ok_or_else(|| anyhow!("Sequence '{}' not found in collection", seq_name))?;

            self.ensure_sequence_loaded(seq_digest)?;

            let record = self
                .sequence_store
                .get(seq_digest)
                .ok_or_else(|| anyhow!("Sequence record not found for digest: {:?}", seq_digest))?;

            let (metadata, sequence_data) = match record {
                SequenceRecord::Stub(_) => {
                    return Err(anyhow!("Sequence data not loaded for '{}'", seq_name));
                }
                SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
            };

            // Decode to plain text according to the store's current mode.
            let decoded_sequence = match self.mode {
                StorageMode::Encoded => {
                    let alphabet = lookup_alphabet(&metadata.alphabet);
                    let decoded =
                        decode_substring_from_bytes(sequence_data, 0, metadata.length, alphabet);
                    String::from_utf8(decoded).context("Failed to decode sequence as UTF-8")?
                }
                StorageMode::Raw => String::from_utf8(sequence_data.clone())
                    .context("Failed to decode raw sequence as UTF-8")?,
            };

            let header = match &metadata.description {
                Some(desc) => format!(">{} {}", metadata.name, desc),
                None => format!(">{}", metadata.name),
            };
            writeln!(writer, "{}", header)?;

            // Wrap sequence output at `line_width` bytes per line.
            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
                writer.write_all(chunk)?;
                writer.write_all(b"\n")?;
            }
        }

        writer.flush()?;

        Ok(())
    }
1460
    /// Export an arbitrary list of sequences (by sha512t24u digest string)
    /// to a FASTA file, independent of any collection.
    ///
    /// `line_width` wraps sequence lines (default 80); output is
    /// gzip-compressed when the path ends in `.gz`.
    ///
    /// # Errors
    /// Fails when a digest is unknown, data is not loaded, or the file
    /// cannot be created/written.
    pub fn export_fasta_by_digests<P: AsRef<Path>>(
        &mut self,
        seq_digests: Vec<&str>,
        output_path: P,
        line_width: Option<usize>,
    ) -> Result<()> {
        let line_width = line_width.unwrap_or(80);
        let output_path = output_path.as_ref();

        let file = File::create(output_path).context(format!(
            "Failed to create output file: {}",
            output_path.display()
        ))?;

        // Compress transparently when the target ends in `.gz`.
        let mut writer: Box<dyn Write> = if output_path.extension() == Some(OsStr::new("gz")) {
            Box::new(GzEncoder::new(file, Compression::default()))
        } else {
            Box::new(file)
        };

        for digest_str in seq_digests {
            let digest_key = digest_str.as_bytes().to_key();

            self.ensure_sequence_loaded(&digest_key)?;

            let record = self
                .sequence_store
                .get(&digest_key)
                .ok_or_else(|| anyhow!("Sequence record not found for digest: {}", digest_str))?;

            let (metadata, sequence_data) = match record {
                SequenceRecord::Stub(_) => {
                    return Err(anyhow!(
                        "Sequence data not loaded for digest: {}",
                        digest_str
                    ));
                }
                SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
            };

            // Decode to plain text according to the store's current mode.
            let decoded_sequence = match self.mode {
                StorageMode::Encoded => {
                    let alphabet = lookup_alphabet(&metadata.alphabet);
                    let decoded =
                        decode_substring_from_bytes(sequence_data, 0, metadata.length, alphabet);
                    String::from_utf8(decoded).context("Failed to decode sequence as UTF-8")?
                }
                StorageMode::Raw => String::from_utf8(sequence_data.clone())
                    .context("Failed to decode raw sequence as UTF-8")?,
            };

            let header = match &metadata.description {
                Some(desc) => format!(">{} {}", metadata.name, desc),
                None => format!(">{}", metadata.name),
            };
            writeln!(writer, "{}", header)?;

            // Wrap sequence output at `line_width` bytes per line.
            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
                writer.write_all(chunk)?;
                writer.write_all(b"\n")?;
            }
        }

        writer.flush()?;

        Ok(())
    }
1547
1548 fn get_sequence_path(digest_str: &str, template: &str) -> PathBuf {
1550 let path_str = template
1551 .replace("%s2", &digest_str[0..2])
1552 .replace("%s", digest_str);
1553
1554 PathBuf::from(path_str)
1555 }
1556
1557 fn write_sequence_to_disk_single(
1559 &self,
1560 metadata: &SequenceMetadata,
1561 sequence: &[u8],
1562 ) -> Result<()> {
1563 let template = self
1564 .seqdata_path_template
1565 .as_ref()
1566 .context("seqdata_path_template not set")?;
1567 let local_path = self.local_path.as_ref().context("local_path not set")?;
1568
1569 let seq_file_path = Self::get_sequence_path(&metadata.sha512t24u, template);
1571 let full_path = local_path.join(&seq_file_path);
1572
1573 if let Some(parent) = full_path.parent() {
1575 create_dir_all(parent)?;
1576 }
1577
1578 let mut file = File::create(&full_path)?;
1580 file.write_all(sequence)?;
1581
1582 Ok(())
1583 }
1584
1585 fn write_collection_to_disk_single(&self, record: &SequenceCollectionRecord) -> Result<()> {
1588 let local_path = self.local_path.as_ref().context("local_path not set")?;
1589
1590 let coll_file_path = format!("collections/{}.rgsi", record.metadata().digest);
1592 let full_path = local_path.join(&coll_file_path);
1593
1594 if let Some(parent) = full_path.parent() {
1596 create_dir_all(parent)?;
1597 }
1598
1599 record.write_collection_rgsi(&full_path)?;
1601
1602 Ok(())
1603 }
1604
1605 fn write_index_files(&self) -> Result<()> {
1610 let local_path = self.local_path.as_ref().context("local_path not set")?;
1611 let template = self
1612 .seqdata_path_template
1613 .as_ref()
1614 .context("seqdata_path_template not set")?;
1615
1616 let sequence_index_path = local_path.join("sequences.rgsi");
1618 self.write_sequences_rgsi(&sequence_index_path)?;
1619
1620 let collection_index_path = local_path.join("collections.rgci");
1622 self.write_collections_rgci(&collection_index_path)?;
1623
1624 let metadata = StoreMetadata {
1626 version: 1,
1627 seqdata_path_template: template.clone(),
1628 collections_path_template: "collections/%s.rgsi".to_string(),
1629 sequence_index: "sequences.rgsi".to_string(),
1630 collection_index: Some("collections.rgci".to_string()),
1631 mode: self.mode,
1632 created_at: Utc::now().to_rfc3339(),
1633 };
1634
1635 let json = serde_json::to_string_pretty(&metadata)
1637 .context("Failed to serialize metadata to JSON")?;
1638 fs::write(local_path.join("rgstore.json"), json).context("Failed to write rgstore.json")?;
1639
1640 Ok(())
1641 }
1642
1643 fn write_collections_rgci<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
1648 let file_path = file_path.as_ref();
1649 let mut file = File::create(file_path)?;
1650
1651 writeln!(
1653 file,
1654 "#digest\tn_sequences\tnames_digest\tsequences_digest\tlengths_digest"
1655 )?;
1656
1657 for record in self.collections.values() {
1659 let meta = record.metadata();
1660 writeln!(
1661 file,
1662 "{}\t{}\t{}\t{}\t{}",
1663 meta.digest,
1664 meta.n_sequences,
1665 meta.names_digest,
1666 meta.sequences_digest,
1667 meta.lengths_digest,
1668 )?;
1669 }
1670 Ok(())
1671 }
1672
1673 pub fn write_sequences_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
1678 let file_path = file_path.as_ref();
1679 let mut file = std::fs::File::create(file_path)?;
1680
1681 writeln!(
1683 file,
1684 "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
1685 )?;
1686
1687 for result_sr in self.sequence_store.values() {
1689 let result = result_sr.metadata().clone();
1690 let description = result.description.as_deref().unwrap_or("");
1691 writeln!(
1692 file,
1693 "{}\t{}\t{}\t{}\t{}\t{}",
1694 result.name,
1695 result.length,
1696 result.alphabet,
1697 result.sha512t24u,
1698 result.md5,
1699 description
1700 )?;
1701 }
1702 Ok(())
1703 }
1704
1705 fn sanitize_relative_path(path: &str) -> Result<()> {
1708 if path.starts_with('/') || path.starts_with('\\') {
1709 return Err(anyhow!("Absolute paths not allowed: {}", path));
1710 }
1711 if path.contains("..") {
1712 return Err(anyhow!("Directory traversal not allowed: {}", path));
1713 }
1714 if path.contains('\0') {
1715 return Err(anyhow!("Null bytes not allowed in path"));
1716 }
1717 Ok(())
1718 }
1719
1720 fn fetch_file(
1723 local_path: &Option<PathBuf>,
1724 remote_source: &Option<String>,
1725 relative_path: &str,
1726 persist_to_disk: bool,
1727 ) -> Result<Vec<u8>> {
1728 Self::sanitize_relative_path(relative_path)?;
1730
1731 if persist_to_disk {
1733 if let Some(local_path) = local_path {
1734 let full_local_path = local_path.join(relative_path);
1735 if full_local_path.exists() {
1736 return fs::read(&full_local_path).context(format!(
1737 "Failed to read local file: {}",
1738 full_local_path.display()
1739 ));
1740 }
1741 }
1742 }
1743
1744 if let Some(remote_url) = remote_source {
1746 let full_remote_url = if remote_url.ends_with('/') {
1747 format!("{}{}", remote_url, relative_path)
1748 } else {
1749 format!("{}/{}", remote_url, relative_path)
1750 };
1751
1752 let response = ureq::get(&full_remote_url)
1753 .call()
1754 .map_err(|e| anyhow!("Failed to fetch from remote: {}", e))?;
1755
1756 let mut data = Vec::new();
1757 response
1758 .into_reader()
1759 .read_to_end(&mut data)
1760 .context("Failed to read response body")?;
1761
1762 if persist_to_disk {
1764 if let Some(local_path) = local_path {
1765 let full_local_path = local_path.join(relative_path);
1766
1767 if let Some(parent) = full_local_path.parent() {
1769 create_dir_all(parent)?;
1770 }
1771
1772 fs::write(&full_local_path, &data).context(format!(
1774 "Failed to cache file to: {}",
1775 full_local_path.display()
1776 ))?;
1777 }
1778 }
1779
1780 Ok(data)
1781 } else {
1782 Err(anyhow!(
1783 "File not found locally and no remote source configured: {}",
1784 relative_path
1785 ))
1786 }
1787 }
1788
1789 pub fn open_local<P: AsRef<Path>>(path: P) -> Result<Self> {
1799 let root_path = path.as_ref();
1800
1801 let index_path = root_path.join("rgstore.json");
1803 let json = fs::read_to_string(&index_path).context(format!(
1804 "Failed to read rgstore.json from {}",
1805 index_path.display()
1806 ))?;
1807
1808 let metadata: StoreMetadata =
1809 serde_json::from_str(&json).context("Failed to parse store metadata")?;
1810
1811 Self::sanitize_relative_path(&metadata.seqdata_path_template)?;
1813 Self::sanitize_relative_path(&metadata.sequence_index)?;
1814 if let Some(ref ci) = metadata.collection_index {
1815 Self::sanitize_relative_path(ci)?;
1816 }
1817
1818 let mut store = RefgetStore::new(metadata.mode);
1820 store.local_path = Some(root_path.to_path_buf());
1821 store.seqdata_path_template = Some(metadata.seqdata_path_template.clone());
1822 store.persist_to_disk = true; let sequence_index_path = root_path.join(&metadata.sequence_index);
1826 if sequence_index_path.exists() {
1827 Self::load_sequences_from_index(&mut store, &sequence_index_path)?;
1828 }
1829
1830 if let Some(ref collection_index) = metadata.collection_index {
1832 let collection_index_path = root_path.join(collection_index);
1833 if collection_index_path.exists() {
1834 Self::load_collection_stubs_from_rgci(&mut store, &collection_index_path)?;
1835 }
1836 }
1837
1838 if store.collections.is_empty() {
1840 let collections_dir = root_path.join("collections");
1841 Self::load_collections_from_directory(&mut store, &collections_dir)?;
1842 }
1843
1844 Ok(store)
1845 }
1846
1847 fn load_sequences_from_index(store: &mut RefgetStore, index_path: &Path) -> Result<()> {
1849 let file = std::fs::File::open(index_path)?;
1850 let reader = std::io::BufReader::new(file);
1851
1852 for line in reader.lines() {
1853 let line = line?;
1854
1855 if line.starts_with('#') {
1857 continue;
1858 }
1859
1860 if let Some(seq_metadata) = parse_rgsi_line(&line) {
1862 let record = SequenceRecord::Stub(seq_metadata.clone());
1864
1865 let sha512_key = seq_metadata.sha512t24u.to_key();
1867 store.sequence_store.insert(sha512_key, record);
1868
1869 let md5_key = seq_metadata.md5.to_key();
1871 store.md5_lookup.insert(md5_key, sha512_key);
1872 }
1873 }
1874
1875 Ok(())
1876 }
1877
1878 fn load_collection_stubs_from_rgci(store: &mut RefgetStore, index_path: &Path) -> Result<()> {
1880 let file = std::fs::File::open(index_path)?;
1881 let reader = std::io::BufReader::new(file);
1882
1883 for line in reader.lines() {
1884 let line = line?;
1885
1886 if let Some(metadata) = parse_rgci_line(&line) {
1887 let key = metadata.digest.to_key();
1888 store
1892 .collections
1893 .insert(key, SequenceCollectionRecord::Stub(metadata));
1894 }
1895 }
1896
1897 Ok(())
1898 }
1899
1900 fn load_collections_from_directory(
1904 store: &mut RefgetStore,
1905 collections_dir: &Path,
1906 ) -> Result<()> {
1907 if !collections_dir.exists() {
1908 return Ok(());
1909 }
1910
1911 for entry in fs::read_dir(collections_dir)? {
1912 let entry = entry?;
1913 let path = entry.path();
1914
1915 if path.is_file() && path.extension() == Some(OsStr::new("rgsi")) {
1916 let collection = read_rgsi_file(&path)?;
1918 let collection_digest = collection.metadata.digest.to_key();
1919
1920 let record = SequenceCollectionRecord::from(collection.clone());
1922
1923 store.collections.insert(collection_digest, record);
1925
1926 let mut name_map = HashMap::new();
1928 for sequence_record in &collection.sequences {
1929 let metadata = sequence_record.metadata();
1930 let sha512_key = metadata.sha512t24u.to_key();
1931 name_map.insert(metadata.name.clone(), sha512_key);
1932 }
1933 store.name_lookup.insert(collection_digest, name_map);
1934 }
1935 }
1936
1937 Ok(())
1938 }
1939
1940 pub fn open_remote<P: AsRef<Path>, S: AsRef<str>>(
1953 cache_path: P,
1954 remote_url: S,
1955 ) -> Result<Self> {
1956 let cache_path = cache_path.as_ref();
1957 let remote_url = remote_url.as_ref().to_string();
1958
1959 create_dir_all(cache_path)?;
1961
1962 let index_data = Self::fetch_file(
1964 &Some(cache_path.to_path_buf()),
1965 &Some(remote_url.clone()),
1966 "rgstore.json",
1967 true,
1968 )?;
1969
1970 let json =
1971 String::from_utf8(index_data).context("Store metadata contains invalid UTF-8")?;
1972
1973 let metadata: StoreMetadata =
1974 serde_json::from_str(&json).context("Failed to parse store metadata")?;
1975
1976 Self::sanitize_relative_path(&metadata.seqdata_path_template)?;
1978 Self::sanitize_relative_path(&metadata.sequence_index)?;
1979 if let Some(ref ci) = metadata.collection_index {
1980 Self::sanitize_relative_path(ci)?;
1981 }
1982
1983 let mut store = RefgetStore::new(metadata.mode);
1985 store.local_path = Some(cache_path.to_path_buf());
1986 store.remote_source = Some(remote_url.clone());
1987 store.seqdata_path_template = Some(metadata.seqdata_path_template.clone());
1988 store.persist_to_disk = true; let sequence_index_data = Self::fetch_file(
1992 &Some(cache_path.to_path_buf()),
1993 &Some(remote_url.clone()),
1994 &metadata.sequence_index,
1995 true, )?;
1997 let sequence_index_str = String::from_utf8(sequence_index_data)
1998 .context("sequence index contains invalid UTF-8")?;
1999
2000 for line in sequence_index_str.lines() {
2002 if line.starts_with('#') {
2004 continue;
2005 }
2006
2007 if let Some(seq_metadata) = parse_rgsi_line(line) {
2009 let record = SequenceRecord::Stub(seq_metadata.clone());
2011
2012 let sha512_key = seq_metadata.sha512t24u.to_key();
2014 store.sequence_store.insert(sha512_key, record);
2015
2016 let md5_key = seq_metadata.md5.to_key();
2018 store.md5_lookup.insert(md5_key, sha512_key);
2019 }
2020 }
2021
2022 if let Some(ref collection_index) = metadata.collection_index {
2024 if let Ok(collection_index_data) = Self::fetch_file(
2025 &Some(cache_path.to_path_buf()),
2026 &Some(remote_url.clone()),
2027 collection_index,
2028 true,
2029 ) {
2030 let collection_index_str = String::from_utf8(collection_index_data)
2031 .context("collection index contains invalid UTF-8")?;
2032
2033 for line in collection_index_str.lines() {
2035 if let Some(coll_metadata) = parse_rgci_line(line) {
2036 let key = coll_metadata.digest.to_key();
2037 store
2038 .collections
2039 .insert(key, SequenceCollectionRecord::Stub(coll_metadata));
2040 }
2041 }
2042 }
2043 }
2044
2045 if store.collections.is_empty() {
2047 let local_collections_dir = cache_path.join("collections");
2048 create_dir_all(&local_collections_dir)?; Self::load_collections_from_directory(&mut store, &local_collections_dir)?;
2050 }
2051
2052 Ok(store)
2053 }
2054
    /// Ensure the name -> sha512-key lookup table exists for this collection,
    /// fetching and parsing its `.rgsi` file (from disk cache or remote) when
    /// the store only holds a stub.
    ///
    /// Side effects: may download/cache the collection file, upgrades the
    /// collection record to Full, and registers any previously-unseen
    /// sequences as lazy stubs.
    fn ensure_collection_loaded(&mut self, collection_digest: &[u8; 32]) -> Result<()> {
        // Fast path: lookup table already built for this collection.
        if self.name_lookup.contains_key(collection_digest) {
            return Ok(());
        }

        // A stub (or an unknown digest) requires fetching the .rgsi file; a
        // Full record already carries its sequence list in memory.
        let needs_fetch = match self.collections.get(collection_digest) {
            Some(SequenceCollectionRecord::Stub(_)) => true,
            Some(SequenceCollectionRecord::Full { .. }) => false,
            None => true, };

        if needs_fetch {
            // Prefer the digest string stored in the stub metadata; otherwise
            // interpret the raw 32-byte key as UTF-8 (lossy).
            let digest_str = if let Some(SequenceCollectionRecord::Stub(meta)) =
                self.collections.get(collection_digest)
            {
                meta.digest.clone()
            } else {
                String::from_utf8_lossy(collection_digest).to_string()
            };

            let relative_path = format!("collections/{}.rgsi", digest_str);

            // Progress message distinguishes a cache hit from a download.
            if !self.quiet {
                let cached = self
                    .local_path
                    .as_ref()
                    .map(|p| p.join(&relative_path).exists())
                    .unwrap_or(false);
                let verb = if cached { "Loading" } else { "Downloading" };
                eprintln!("{} collection {}...", verb, digest_str);
            }
            // Fetch mainly for its side effect: the file lands in the local
            // cache, where read_rgsi_file below parses it.
            let _collection_data =
                Self::fetch_file(&self.local_path, &self.remote_source, &relative_path, true)?;

            let local_path = self
                .local_path
                .as_ref()
                .ok_or_else(|| anyhow!("No local path configured"))?;

            let collection_file_path = local_path.join(&relative_path);

            let collection = read_rgsi_file(&collection_file_path)?;

            // Integrity check: the parsed file must describe the digest we
            // asked for.
            let loaded_digest = collection.metadata.digest.to_key();
            if loaded_digest != *collection_digest {
                return Err(anyhow!(
                    "Collection digest mismatch: expected {}, got {}",
                    String::from_utf8_lossy(collection_digest),
                    String::from_utf8_lossy(&loaded_digest)
                ));
            }

            let record = SequenceCollectionRecord::from(collection.clone());

            self.collections.insert(*collection_digest, record);

            // Build the name lookup and register any sequences this store has
            // not seen yet as lazy stubs.
            let mut name_map = HashMap::new();
            for sequence_record in &collection.sequences {
                let metadata = sequence_record.metadata();
                let sha512_key = metadata.sha512t24u.to_key();
                name_map.insert(metadata.name.clone(), sha512_key);

                if !self.sequence_store.contains_key(&sha512_key) {
                    self.sequence_store
                        .insert(sha512_key, SequenceRecord::Stub(metadata.clone()));
                    let md5_key = metadata.md5.to_key();
                    self.md5_lookup.insert(md5_key, sha512_key);
                }
            }
            self.name_lookup.insert(*collection_digest, name_map);
        } else {
            // Full record already in memory. Copy the per-sequence data out
            // first so the immutable borrow of `self.collections` ends before
            // the mutable inserts into the other maps below.
            let sequences_data: Vec<(SequenceMetadata, [u8; 32], [u8; 32])> =
                if let Some(SequenceCollectionRecord::Full { sequences, .. }) =
                    self.collections.get(collection_digest)
                {
                    sequences
                        .iter()
                        .map(|seq| {
                            let metadata = seq.metadata().clone();
                            let sha512_key = metadata.sha512t24u.to_key();
                            let md5_key = metadata.md5.to_key();
                            (metadata, sha512_key, md5_key)
                        })
                        .collect()
                } else {
                    Vec::new()
                };

            let mut name_map = HashMap::new();
            for (metadata, sha512_key, md5_key) in sequences_data {
                name_map.insert(metadata.name.clone(), sha512_key);

                if !self.sequence_store.contains_key(&sha512_key) {
                    self.sequence_store
                        .insert(sha512_key, SequenceRecord::Stub(metadata));
                    self.md5_lookup.insert(md5_key, sha512_key);
                }
            }
            self.name_lookup.insert(*collection_digest, name_map);
        }

        Ok(())
    }
2177
    /// Lazily materialize a sequence's data bytes, fetching from the disk
    /// cache or remote source via the configured path template when the
    /// record is still a stub.
    ///
    /// Errors if the digest is unknown or no path template is configured.
    fn ensure_sequence_loaded(&mut self, digest: &[u8; 32]) -> Result<()> {
        let record = self
            .sequence_store
            .get(digest)
            .ok_or_else(|| anyhow!("Sequence not found in store"))?;

        // Already materialized; nothing to do.
        if matches!(record, SequenceRecord::Full { .. }) {
            return Ok(());
        }

        let digest_str = &record.metadata().sha512t24u;
        let template = self
            .seqdata_path_template
            .as_ref()
            .ok_or_else(|| anyhow!("No sequence data path template configured"))?;

        // Expand the path template: %s2/%s4 are digest-prefix placeholders
        // used for directory fan-out; %s is the full digest. %s must be
        // substituted last so it doesn't clobber the prefix placeholders.
        let relative_path = template
            .replace("%s2", &digest_str[0..2])
            .replace("%s4", &digest_str[0..4])
            .replace("%s", digest_str);

        // Progress message distinguishes a cache hit from a download.
        if !self.quiet {
            let cached = self
                .local_path
                .as_ref()
                .map(|p| p.join(&relative_path).exists())
                .unwrap_or(false);
            let verb = if cached { "Loading" } else { "Downloading" };
            eprintln!("{} sequence {}...", verb, digest_str);
        }
        let data = Self::fetch_file(
            &self.local_path,
            &self.remote_source,
            &relative_path,
            self.persist_to_disk,
        )?;

        // Upgrade the stub in place with the fetched bytes.
        self.sequence_store.entry(*digest).and_modify(|r| {
            r.load_data(data);
        });

        Ok(())
    }
2230
2231 pub fn write(&self) -> Result<()> {
2256 if !self.persist_to_disk {
2257 return Err(anyhow!(
2258 "write() only works with disk-backed stores - use write_store_to_dir() instead"
2259 ));
2260 }
2261
2262 self.write_index_files()
2264 }
2265
2266 pub fn write_store_to_dir<P: AsRef<Path>>(
2268 &self,
2269 root_path: P,
2270 seqdata_path_template: Option<&str>,
2271 ) -> Result<()> {
2272 let root_path = root_path.as_ref();
2273
2274 let template = seqdata_path_template
2276 .or(self.seqdata_path_template.as_deref())
2277 .unwrap_or(DEFAULT_SEQDATA_PATH_TEMPLATE);
2278
2279 println!(
2280 "Writing store to directory: {}; Using seqdata path template: {}",
2281 root_path.display(),
2282 template
2283 );
2284
2285 fs::create_dir_all(root_path)?;
2287
2288 let sequences_dir = root_path.join("sequences");
2290 fs::create_dir_all(&sequences_dir)?;
2291
2292 let collections_dir = root_path.join("collections");
2294 fs::create_dir_all(&collections_dir)?;
2295
2296 for record in self.sequence_store.values() {
2298 match record {
2299 SequenceRecord::Full { metadata, .. } => {
2300 let rel_path = Self::get_sequence_path(&metadata.sha512t24u, template);
2302 let full_path = root_path.join(&rel_path);
2303
2304 record.to_file(full_path)?;
2306 }
2307 SequenceRecord::Stub(_metadata) => {
2308 continue;
2310 }
2311 }
2312 }
2313
2314 for record in self.collections.values() {
2316 let collection_file_path =
2317 root_path.join(format!("collections/{}.rgsi", record.metadata().digest));
2318 record.write_collection_rgsi(&collection_file_path)?;
2319 }
2320
2321 let sequence_index_path = root_path.join("sequences.rgsi");
2323 self.write_sequences_rgsi(&sequence_index_path)?;
2324
2325 let collection_index_path = root_path.join("collections.rgci");
2327 self.write_collections_rgci(&collection_index_path)?;
2328
2329 let metadata = StoreMetadata {
2331 version: 1,
2332 seqdata_path_template: template.to_string(),
2333 collections_path_template: "collections/%s.rgsi".to_string(),
2334 sequence_index: "sequences.rgsi".to_string(),
2335 collection_index: Some("collections.rgci".to_string()),
2336 mode: self.mode,
2337 created_at: Utc::now().to_rfc3339(),
2338 };
2339
2340 let json = serde_json::to_string_pretty(&metadata)
2342 .context("Failed to serialize metadata to JSON")?;
2343 fs::write(root_path.join("rgstore.json"), json).context("Failed to write rgstore.json")?;
2344
2345 Ok(())
2346 }
2347
2348 pub fn stats(&self) -> (usize, usize, &'static str) {
2356 let n_sequences = self.sequence_store.len();
2357 let n_collections_loaded = self
2358 .collections
2359 .values()
2360 .filter(|record| record.has_sequences())
2361 .count();
2362 let mode_str = match self.mode {
2363 StorageMode::Raw => "Raw",
2364 StorageMode::Encoded => "Encoded",
2365 };
2366 (n_sequences, n_collections_loaded, mode_str)
2367 }
2368
2369 pub fn stats_extended(&self) -> StoreStats {
2371 let n_sequences = self.sequence_store.len();
2372 let n_sequences_loaded = self
2373 .sequence_store
2374 .values()
2375 .filter(|record| record.is_loaded())
2376 .count();
2377 let n_collections = self.collections.len();
2378 let n_collections_loaded = self
2379 .collections
2380 .values()
2381 .filter(|record| record.has_sequences())
2382 .count();
2383 let mode_str = match self.mode {
2384 StorageMode::Raw => "Raw",
2385 StorageMode::Encoded => "Encoded",
2386 };
2387 let total_disk_size = self.actual_disk_usage();
2388 StoreStats {
2389 n_sequences,
2390 n_sequences_loaded,
2391 n_collections,
2392 n_collections_loaded,
2393 storage_mode: mode_str.to_string(),
2394 total_disk_size,
2395 }
2396 }
2397}
2398
/// Snapshot of store size/load statistics, as produced by
/// `RefgetStore::stats_extended`.
#[derive(Debug, Clone)]
pub struct StoreStats {
    // Total number of sequence records (stubs + fully loaded).
    pub n_sequences: usize,
    // Number of sequence records whose data is currently in memory.
    pub n_sequences_loaded: usize,
    // Total number of collection records (stubs + fully loaded).
    pub n_collections: usize,
    // Number of collections whose sequence lists are loaded.
    pub n_collections_loaded: usize,
    // Human-readable storage mode ("Raw" or "Encoded").
    pub storage_mode: String,
    // On-disk footprint in bytes, from `actual_disk_usage`.
    pub total_disk_size: usize,
}
2415
/// Render a byte count as a human-readable string (B, KB, MB, GB, TB).
///
/// Counts below 1024 are printed as exact integers (e.g. "512 B"); larger
/// sizes use two decimal places in the largest fitting unit (e.g. "1.50 KB").
fn format_bytes(bytes: usize) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];

    let mut scaled = bytes as f64;
    let mut unit = 0;
    while scaled >= 1024.0 && unit + 1 < UNITS.len() {
        scaled /= 1024.0;
        unit += 1;
    }

    match unit {
        0 => format!("{} {}", bytes, UNITS[0]),
        _ => format!("{:.2} {}", scaled, UNITS[unit]),
    }
}
2433
impl Display for RefgetStore {
    /// Human-readable dump: mode and disk size, a preview of up to 10
    /// sequences (first 8 characters each), and each collection's first 5
    /// name mappings.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let total_size = self.total_disk_size();
        let size_str = format_bytes(total_size);
        writeln!(f, "SeqColStore object:")?;
        writeln!(f, " Mode: {:?}", self.mode)?;
        writeln!(f, " Disk size: {} ({} bytes)", size_str, total_size)?;
        writeln!(f, ">Sequences (n={}):", self.sequence_store.len())?;
        for (i, (sha512_digest, sequence_record)) in self.sequence_store.iter().take(10).enumerate()
        {
            let metadata = sequence_record.metadata();
            // Preview of the first 8 sequence characters; stubs have no data.
            let first_8_chars = match sequence_record {
                SequenceRecord::Stub(_) => "<stub>".to_string(),
                SequenceRecord::Full {
                    metadata,
                    sequence: seq,
                } => {
                    match self.mode {
                        // Encoded bytes must be decoded via the alphabet
                        // before they are printable.
                        StorageMode::Encoded => {
                            let alphabet = lookup_alphabet(&metadata.alphabet);
                            let decoded = decode_substring_from_bytes(
                                seq,
                                0,
                                8.min(metadata.length),
                                alphabet,
                            );
                            String::from_utf8(decoded).unwrap_or_else(|_| "???".to_string())
                        }
                        // Raw bytes can be shown directly (clamped to len).
                        StorageMode::Raw => String::from_utf8(seq[0..8.min(seq.len())].to_vec())
                            .unwrap_or_else(|_| "???".to_string()),
                    }
                }
            };

            writeln!(
                f,
                " - {}. {:02x?}, MD5: {:02x?}, Length: {}, Alphabet: {:?}, Start: {}",
                i + 1,
                // NOTE(review): assumes digest keys are valid UTF-8 (base64
                // digest strings) — this unwrap would panic otherwise; confirm.
                std::str::from_utf8(sha512_digest).unwrap(),
                &metadata.md5,
                &metadata.length,
                &metadata.alphabet,
                first_8_chars
            )?;
        }
        writeln!(f, ">Collections (n={:?}):", self.name_lookup.len())?;
        for (i, (digest, name_map)) in self.name_lookup.iter().enumerate() {
            let seqcol_digest_str = String::from_utf8_lossy(digest);
            writeln!(
                f,
                " {}. Collection Digest: {:02x?} ({} sequences)",
                i + 1,
                seqcol_digest_str,
                name_map.len()
            )?;
            // Show at most 5 name mappings per collection.
            for (name, sha512_digest) in name_map.iter().take(5) {
                let sha512_str = String::from_utf8_lossy(sha512_digest);
                writeln!(f, " - Name: {}, SHA512: {:02x?}", name, sha512_str)?;
            }
            if name_map.len() > 5 {
                writeln!(f, " - ... and {} more", name_map.len() - 5)?;
            }
        }

        Ok(())
    }
}
2507
2508#[cfg(test)]
2509mod tests {
2510 use super::*;
2511 use crate::collection::{
2513 SequenceCollection, SequenceCollectionMetadata, SequenceMetadata, SequenceRecord,
2514 };
2515 use crate::digest::{md5, sha512t24u};
2516 use tempfile::tempdir;
2517
2518 fn calculate_test_digests(sequence: &[u8]) -> (String, String) {
2522 (sha512t24u(sequence), md5(sequence))
2523 }
2524
2525 fn setup_export_test_store(temp_path: &std::path::Path) -> (RefgetStore, [u8; 32]) {
2527 let fasta_content = ">chr1\nATGCATGCATGC\n>chr2\nGGGGAAAA\n>chr3\nTTTTCCCC\n";
2528 let temp_fasta_path = temp_path.join("test.fa");
2529 fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");
2530
2531 let mut store = RefgetStore::in_memory();
2532 store
2533 .add_sequence_collection_from_fasta(&temp_fasta_path)
2534 .unwrap();
2535
2536 let collections: Vec<_> = store.collections.keys().cloned().collect();
2537 let collection_digest = collections[0];
2538
2539 (store, collection_digest)
2540 }
2541
2542 #[test]
2543 fn test_mode_basics() {
2544 let mut store = RefgetStore::in_memory();
2546
2547 assert_eq!(store.mode, StorageMode::Encoded);
2549
2550 store.disable_encoding();
2552 assert_eq!(store.mode, StorageMode::Raw);
2553 store.enable_encoding();
2554 assert_eq!(store.mode, StorageMode::Encoded);
2555
2556 store.set_encoding_mode(StorageMode::Raw);
2558 assert_eq!(store.mode, StorageMode::Raw);
2559 store.set_encoding_mode(StorageMode::Encoded);
2560 assert_eq!(store.mode, StorageMode::Encoded);
2561 }
2562
    /// Switching storage mode re-encodes in-memory sequence data both ways
    /// (Raw -> Encoded and Encoded -> Raw) without changing the decoded
    /// sequence. 12 DNA bases pack into 3 bytes at 2 bits/base.
    #[test]
    fn test_mode_switching() {
        let temp_dir = tempdir().expect("Failed to create temporary directory");
        let temp_path = temp_dir.path();
        let fasta_content = ">chr1\nATGCATGCATGC\n>chr2\nGGGGAAAA\n";
        let temp_fasta_path = temp_path.join("test.fa");
        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");

        let (chr1_sha, _) = calculate_test_digests(b"ATGCATGCATGC");
        let chr1_key = chr1_sha.as_bytes().to_key();

        // Raw -> Encoded.
        {
            let mut store = RefgetStore::in_memory();
            store.disable_encoding();
            store
                .add_sequence_collection_from_fasta(&temp_fasta_path)
                .unwrap();

            // Raw mode stores the sequence bytes verbatim.
            if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key)
            {
                assert_eq!(sequence, b"ATGCATGCATGC");
            }
            let seq_before = store.get_sequence(&chr1_sha).unwrap().decode().unwrap();

            store.set_encoding_mode(StorageMode::Encoded);

            // 12 bases at 2 bits/base pack into 3 bytes.
            if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key)
            {
                assert_eq!(sequence.len(), 3); }
            let seq_after = store.get_sequence(&chr1_sha).unwrap().decode().unwrap();
            assert_eq!(seq_before, seq_after);
        }

        // Encoded (default) -> Raw.
        {
            let mut store = RefgetStore::in_memory();
            store
                .add_sequence_collection_from_fasta(&temp_fasta_path)
                .unwrap();

            // Default Encoded mode stores the packed 3-byte form.
            if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key)
            {
                assert_eq!(sequence.len(), 3);
            }
            let seq_before = store.get_sequence(&chr1_sha).unwrap().decode().unwrap();

            store.disable_encoding();

            // After disabling encoding the raw bytes are restored.
            if let Some(SequenceRecord::Full { sequence, .. }) = store.sequence_store.get(&chr1_key)
            {
                assert_eq!(sequence, b"ATGCATGCATGC");
            }
            let seq_after = store.get_sequence(&chr1_sha).unwrap().decode().unwrap();
            assert_eq!(seq_before, seq_after);
        }
    }
2627
    /// End-to-end region retrieval: imports a 2-sequence FASTA, then checks
    /// both `export_fasta_from_regions` (file output) and
    /// `substrings_from_regions` (iterator output) against a BED file.
    #[test]
    fn test_refget_store_retrieve_seq_and_vec() {
        let temp_dir = tempdir().expect("Failed to create temporary directory");
        let temp_path = temp_dir.path();

        let fasta_content = "\
>chr1
ATGCATGCATGC
>chr2
GGGGAAAA
";
        let temp_fasta_path = temp_path.join("test.fa");

        fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");

        let mut store = RefgetStore::in_memory();
        store
            .add_sequence_collection_from_fasta(&temp_fasta_path)
            .unwrap();

        let sequence_keys: Vec<[u8; 32]> = store.sequence_store.keys().cloned().collect();

        // Pre-computed seqcol digest for this exact 2-sequence FASTA.
        let _ = sequence_keys[0]; let _ = sequence_keys[1]; let collection_digest_ref: &str = "uC_UorBNf3YUu1YIDainBhI94CedlNeH";

        let (chr1_sha, chr1_md5) = calculate_test_digests(b"ATGCATGCATGC");
        let (chr2_sha, chr2_md5) = calculate_test_digests(b"GGGGAAAA");
        println!("chr1 values: {} {}", chr1_sha, chr1_md5);
        println!("chr2 values: {} {}", chr2_sha, chr2_md5);

        // Three half-open regions: chr1[0,5), chr1[8,12), chr2[0,4).
        let bed_content = "\
chr1\t0\t5
chr1\t8\t12
chr2\t0\t4
";
        let temp_bed_path = temp_path.join("test.bed");

        fs::write(&temp_bed_path, bed_content).expect("Failed to write test BED file");

        let temp_output_fa_path = temp_path.join("output.fa");

        // Region export to FASTA.
        store
            .export_fasta_from_regions(
                collection_digest_ref,
                temp_bed_path.to_str().unwrap(),
                temp_output_fa_path.to_str().unwrap(),
            )
            .expect("export_fasta_from_regions failed");

        let output_fa_content =
            fs::read_to_string(&temp_output_fa_path).expect("Failed to read output FASTA file");

        // Per-chromosome regions are concatenated under one header:
        // chr1 -> "ATGCA" + "ATGC", chr2 -> "GGGG".
        let expected_fa_content = format!(
            ">chr1 12 dna2bit {} {}\nATGCAATGC\n>chr2 8 dna2bit {} {}\nGGGG\n",
            chr1_sha, chr1_md5, chr2_sha, chr2_md5
        );
        assert_eq!(
            output_fa_content.trim(),
            expected_fa_content.trim(),
            "Output FASTA file content mismatch"
        );
        println!("✓ export_fasta_from_regions test passed.");

        // Same regions via the iterator API, one RetrievedSequence each.
        let vec_result: Vec<_> = store
            .substrings_from_regions(collection_digest_ref, temp_bed_path.to_str().unwrap())
            .expect("substrings_from_regions failed")
            .collect::<Result<Vec<_>, _>>()
            .expect("substrings_from_regions had errors");

        let expected_vec = vec![
            RetrievedSequence {
                sequence: "ATGCA".to_string(),
                chrom_name: "chr1".to_string(),
                start: 0,
                end: 5,
            },
            RetrievedSequence {
                sequence: "ATGC".to_string(),
                chrom_name: "chr1".to_string(),
                start: 8,
                end: 12,
            },
            RetrievedSequence {
                sequence: "GGGG".to_string(),
                chrom_name: "chr2".to_string(),
                start: 0,
                end: 4,
            },
        ];

        assert_eq!(
            vec_result, expected_vec,
            "Retrieved sequence vector mismatch"
        );
        println!("✓ substrings_from_regions test passed.");
    }
2736
    /// A hand-built single-sequence collection can be added to the store and
    /// the sequence retrieved by name (str and key) and by sha512 digest
    /// (str and key), each returning identical metadata and bytes.
    #[test]
    fn test_global_refget_store() {
        let sequence = b"ACGT";
        let name = "test_seq";
        println!("Testing RefgetStore with sequence: {}", name);

        // Minimal collection shell; digest fields are placeholders since the
        // lookups below go through the collection digest string directly.
        let mut collection = SequenceCollection {
            metadata: SequenceCollectionMetadata {
                digest: "test_collection".to_string(),
                n_sequences: 0,
                names_digest: "test".to_string(),
                sequences_digest: "test".to_string(),
                lengths_digest: "test".to_string(),
                file_path: None,
            },
            sequences: Vec::new(),
        };

        // Real digests for the 4-base sequence.
        let seq_metadata = SequenceMetadata {
            name: name.to_string(),
            description: None,
            length: sequence.len(),
            sha512t24u: sha512t24u(sequence),
            md5: md5(sequence),
            alphabet: AlphabetType::Dna2bit,
            fai: None,
        };

        let record = SequenceRecord::Full {
            metadata: seq_metadata.clone(),
            sequence: sequence.to_vec(),
        };

        collection.sequences.push(record);

        let mut store = RefgetStore::in_memory();
        store.add_sequence_collection(collection.clone()).unwrap();

        assert!(!store.sequence_store.is_empty());

        // Lookup by collection digest (as &str) + sequence name.
        let retrieved_by_name_str = store.get_sequence_by_name(&collection.metadata.digest, name);
        assert!(retrieved_by_name_str.is_ok());
        let retrieved_record = retrieved_by_name_str.unwrap();
        assert_eq!(retrieved_record.metadata().name, name);
        assert_eq!(retrieved_record.sequence().unwrap(), sequence);

        // Lookup by collection digest (as 32-byte key) + sequence name.
        let retrieved_by_name_key =
            store.get_sequence_by_name(collection.metadata.digest.to_key(), name);
        assert!(retrieved_by_name_key.is_ok());
        let retrieved_record = retrieved_by_name_key.unwrap();
        assert_eq!(retrieved_record.metadata().name, name);
        assert_eq!(retrieved_record.sequence().unwrap(), sequence);

        // Lookup by sha512t24u digest (as &str).
        let retrieved_by_sha512_str = store.get_sequence(&seq_metadata.sha512t24u);
        assert!(retrieved_by_sha512_str.is_ok());
        let retrieved_record = retrieved_by_sha512_str.unwrap();
        assert_eq!(retrieved_record.metadata().name, name);
        assert_eq!(retrieved_record.sequence().unwrap(), sequence);

        // Lookup by sha512t24u digest (as 32-byte key).
        let retrieved_by_sha512_key = store.get_sequence(seq_metadata.sha512t24u.to_key());
        assert!(retrieved_by_sha512_key.is_ok());
        let retrieved_record = retrieved_by_sha512_key.unwrap();
        assert_eq!(retrieved_record.metadata().name, name);
        assert_eq!(retrieved_record.sequence().unwrap(), sequence);
    }
2810
2811 #[test]
2812 fn test_import_fasta() {
2813 let temp_dir = tempdir().expect("Failed to create temporary directory");
2814 let temp_path = temp_dir.path();
2815
2816 let test_fa = "../tests/data/fasta/base.fa";
2818 let temp_fa = temp_path.join("base.fa");
2819
2820 std::fs::copy(test_fa, &temp_fa).expect("Failed to copy test FASTA file");
2821
2822 let mut store = RefgetStore::in_memory();
2823
2824 store.add_sequence_collection_from_fasta(temp_fa).unwrap();
2826
2827 assert!(!store.sequence_store.is_empty());
2829
2830 let seq_template = "sequences/%s2/%s.seq";
2832 store
2833 .write_store_to_dir(temp_path.to_str().unwrap(), Some(seq_template))
2834 .unwrap();
2835 }
2836
2837 #[test]
2838 fn test_disk_persistence() {
2839 let temp_dir = tempdir().unwrap();
2841 let temp_path = temp_dir.path();
2842 let temp_fasta = temp_path.join("base.fa.gz");
2843 std::fs::copy("../tests/data/fasta/base.fa.gz", &temp_fasta)
2844 .expect("Failed to copy base.fa.gz to tempdir");
2845
2846 let mut store = RefgetStore::in_memory();
2848
2849 store
2852 .add_sequence_collection_from_fasta(&temp_fasta)
2853 .unwrap();
2854
2855 let sequence_keys: Vec<[u8; 32]> = store.sequence_store.keys().cloned().collect();
2857 assert_eq!(
2858 sequence_keys.len(),
2859 3,
2860 "Test file should contain exactly 3 sequences"
2861 );
2862
2863 let sha512_key1 = sequence_keys[0];
2864 let sha512_key2 = sequence_keys[1];
2865
2866 let original_seq1 = store.sequence_store.get(&sha512_key1).unwrap().clone();
2868 let original_seq2 = store.sequence_store.get(&sha512_key2).unwrap().clone();
2869
2870 let seq_template = "sequences/%s2/%s.seq";
2872 store
2873 .write_store_to_dir(temp_path, Some(seq_template))
2874 .unwrap();
2875
2876 assert!(temp_path.join("sequences").exists());
2878 assert!(temp_path.join("sequences").read_dir().unwrap().count() > 0);
2879 assert!(temp_path.join("rgstore.json").exists());
2880 assert!(temp_path.join("sequences.rgsi").exists());
2881 assert!(temp_path.join("collections.rgci").exists());
2882 assert!(temp_path.join("collections").exists());
2883
2884 let mut loaded_store = RefgetStore::open_local(temp_path).unwrap();
2886
2887 assert_eq!(loaded_store.sequence_store.len(), 3);
2889
2890 assert!(loaded_store.sequence_store.contains_key(&sha512_key1));
2892 assert!(loaded_store.sequence_store.contains_key(&sha512_key2));
2893
2894 let loaded_seq1 = loaded_store.sequence_store.get(&sha512_key1).unwrap();
2896 let loaded_seq2 = loaded_store.sequence_store.get(&sha512_key2).unwrap();
2897
2898 assert_eq!(original_seq1.metadata().name, loaded_seq1.metadata().name);
2900 assert_eq!(
2901 original_seq1.metadata().length,
2902 loaded_seq1.metadata().length
2903 );
2904 assert_eq!(
2905 original_seq1.metadata().sha512t24u,
2906 loaded_seq1.metadata().sha512t24u
2907 );
2908 assert_eq!(original_seq1.metadata().md5, loaded_seq1.metadata().md5);
2909
2910 assert_eq!(original_seq2.metadata().name, loaded_seq2.metadata().name);
2911 assert_eq!(
2912 original_seq2.metadata().length,
2913 loaded_seq2.metadata().length
2914 );
2915 assert_eq!(
2916 original_seq2.metadata().sha512t24u,
2917 loaded_seq2.metadata().sha512t24u
2918 );
2919 assert_eq!(original_seq2.metadata().md5, loaded_seq2.metadata().md5);
2920
2921 assert_eq!(
2923 loaded_seq1.is_loaded(),
2924 false,
2925 "Data should not be loaded initially with lazy loading"
2926 );
2927 assert_eq!(
2928 loaded_seq2.is_loaded(),
2929 false,
2930 "Data should not be loaded initially with lazy loading"
2931 );
2932
2933 assert_eq!(loaded_store.md5_lookup.len(), 3);
2935
2936 assert_eq!(loaded_store.collections.len(), store.collections.len());
2938
2939 for (digest, original_record) in &store.sequence_store {
2941 let loaded_record = loaded_store.get_sequence(*digest).unwrap();
2942 assert_eq!(
2943 original_record.metadata().name,
2944 loaded_record.metadata().name
2945 );
2946 assert_eq!(
2947 original_record.metadata().length,
2948 loaded_record.metadata().length
2949 );
2950
2951 if original_record.metadata().length > 0 {
2953 let substring_len = std::cmp::min(5, original_record.metadata().length);
2954 let substring = loaded_store.get_substring(digest, 0, substring_len);
2955 assert!(
2956 substring.is_ok(),
2957 "Should be able to retrieve substring from loaded sequence"
2958 );
2959 }
2960 }
2961
2962 println!("✓ Disk persistence test passed - all data preserved correctly");
2963 }
2964
2965 #[test]
2966 fn test_export_fasta_all_sequences() {
2967 let temp_dir = tempdir().expect("Failed to create temporary directory");
2968 let (mut store, collection_digest) = setup_export_test_store(temp_dir.path());
2969
2970 let output_path = temp_dir.path().join("exported_all.fa");
2971 store
2972 .export_fasta(&collection_digest, &output_path, None, Some(80))
2973 .unwrap();
2974
2975 let exported = fs::read_to_string(&output_path).unwrap();
2976 assert!(
2977 exported.contains(">chr1") && exported.contains(">chr2") && exported.contains(">chr3")
2978 );
2979 assert!(
2980 exported.contains("ATGCATGCATGC")
2981 && exported.contains("GGGGAAAA")
2982 && exported.contains("TTTTCCCC")
2983 );
2984 }
2985
2986 #[test]
2987 fn test_export_fasta_subset_sequences() {
2988 let temp_dir = tempdir().expect("Failed to create temporary directory");
2989 let (mut store, collection_digest) = setup_export_test_store(temp_dir.path());
2990
2991 let output_path = temp_dir.path().join("exported_subset.fa");
2992 store
2993 .export_fasta(
2994 &collection_digest,
2995 &output_path,
2996 Some(vec!["chr1", "chr3"]),
2997 Some(80),
2998 )
2999 .unwrap();
3000
3001 let exported = fs::read_to_string(&output_path).unwrap();
3002 assert!(exported.contains(">chr1") && exported.contains(">chr3"));
3003 assert!(!exported.contains(">chr2") && !exported.contains("GGGGAAAA"));
3004 }
3005
3006 #[test]
3007 fn test_export_fasta_roundtrip() {
3008 let temp_dir = tempdir().expect("Failed to create temporary directory");
3009 let temp_path = temp_dir.path();
3010
3011 let fasta_content = "\
3013>seq1
3014ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC
3015ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC
3016>seq2
3017GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGGAAAACCCC
3018";
3019 let temp_fasta_path = temp_path.join("original.fa");
3020 fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");
3021
3022 let mut store1 = RefgetStore::in_memory();
3024 store1
3025 .add_sequence_collection_from_fasta(&temp_fasta_path)
3026 .unwrap();
3027
3028 let original_digests: Vec<String> = store1
3030 .sequence_store
3031 .values()
3032 .map(|r| r.metadata().sha512t24u.clone())
3033 .collect();
3034
3035 let collections: Vec<_> = store1.collections.keys().cloned().collect();
3037 let collection_digest = collections[0];
3038 let exported_path = temp_path.join("exported.fa");
3039 store1
3040 .export_fasta(&collection_digest, &exported_path, None, Some(60))
3041 .expect("Failed to export FASTA");
3042
3043 let mut store2 = RefgetStore::in_memory();
3045 store2
3046 .add_sequence_collection_from_fasta(&exported_path)
3047 .unwrap();
3048
3049 let new_digests: Vec<String> = store2
3051 .sequence_store
3052 .values()
3053 .map(|r| r.metadata().sha512t24u.clone())
3054 .collect();
3055
3056 assert_eq!(
3057 original_digests.len(),
3058 new_digests.len(),
3059 "Should have same number of sequences"
3060 );
3061 for digest in original_digests {
3062 assert!(
3063 new_digests.contains(&digest),
3064 "Digest {} should be present after roundtrip",
3065 digest
3066 );
3067 }
3068
3069 println!("✓ Export/import roundtrip test passed");
3070 }
3071
3072 #[test]
3073 fn test_export_fasta_by_digests() {
3074 let temp_dir = tempdir().expect("Failed to create temporary directory");
3075 let (mut store, _) = setup_export_test_store(temp_dir.path());
3076
3077 let digests: Vec<String> = store
3078 .sequence_store
3079 .values()
3080 .map(|r| r.metadata().sha512t24u.clone())
3081 .collect();
3082 let digest_refs: Vec<&str> = digests.iter().map(|s| s.as_str()).collect();
3083
3084 let output_path = temp_dir.path().join("exported_by_digests.fa");
3085 store
3086 .export_fasta_by_digests(digest_refs, &output_path, Some(80))
3087 .unwrap();
3088
3089 let exported = fs::read_to_string(&output_path).unwrap();
3090 assert!(
3091 exported.contains(">chr1") && exported.contains(">chr2") && exported.contains(">chr3")
3092 );
3093 }
3094
3095 #[test]
3096 fn test_export_fasta_error_handling() {
3097 let temp_dir = tempdir().expect("Failed to create temporary directory");
3098 let (mut store, collection_digest) = setup_export_test_store(temp_dir.path());
3099
3100 let output_path = temp_dir.path().join("should_fail.fa");
3101
3102 let fake_collection = b"fake_collection_digest_12345678";
3104 assert!(
3105 store
3106 .export_fasta(fake_collection, &output_path, None, Some(80))
3107 .is_err()
3108 );
3109
3110 assert!(
3112 store
3113 .export_fasta(
3114 &collection_digest,
3115 &output_path,
3116 Some(vec!["nonexistent_chr"]),
3117 Some(80)
3118 )
3119 .is_err()
3120 );
3121 }
3122
3123 #[test]
3124 fn test_export_fasta_after_load_local() {
3125 let temp_dir = tempdir().expect("Failed to create temporary directory");
3128 let temp_path = temp_dir.path();
3129 let store_path = temp_path.join("store");
3130
3131 let fasta_content = ">chr1\nACGTACGT\n>chr2\nGGGGAAAA\n";
3133 let fasta_path = temp_path.join("test.fa");
3134 fs::write(&fasta_path, fasta_content).unwrap();
3135
3136 let collection_digest: [u8; 32];
3138 {
3139 let mut store = RefgetStore::on_disk(&store_path).unwrap();
3140 store
3141 .add_sequence_collection_from_fasta(&fasta_path)
3142 .unwrap();
3143 let collections: Vec<_> = store.collections.keys().cloned().collect();
3144 assert_eq!(collections.len(), 1, "Should have exactly one collection");
3145 collection_digest = collections[0];
3146 } let mut loaded_store = RefgetStore::open_local(&store_path).unwrap();
3150
3151 assert!(
3153 !loaded_store.is_collection_loaded(&collection_digest),
3154 "Collection should be Stub after loading from disk"
3155 );
3156
3157 let output_path = temp_path.join("exported.fa");
3159 loaded_store
3160 .export_fasta(&collection_digest, &output_path, None, Some(80))
3161 .expect("export_fasta should work on disk-loaded stores");
3162
3163 let exported = fs::read_to_string(&output_path).unwrap();
3165 assert!(exported.contains(">chr1"));
3166 assert!(exported.contains("ACGTACGT"));
3167 assert!(exported.contains(">chr2"));
3168 assert!(exported.contains("GGGGAAAA"));
3169
3170 println!("✓ export_fasta after load_local test passed");
3171 }
3172
3173 #[test]
3174 fn test_sequence_names_with_spaces() {
3175 let temp_dir = tempdir().expect("Failed to create temporary directory");
3178 let temp_path = temp_dir.path();
3179
3180 let fasta_content = "\
3183>JAHKSE010000016.1 unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000016.1:1:100:1
3184ATGCATGCATGCATGCATGCATGCATGCATGCATGC
3185ATGCATGCATGCATGCATGCATGCATGCATGCATGC
3186>JAHKSE010000012.1 unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000012.1:1:100:1
3187GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGG
3188GGGGAAAACCCCTTTTGGGGAAAACCCCTTTTGGGG
3189";
3190 let temp_fasta_path = temp_path.join("spaces_in_names.fa");
3191 fs::write(&temp_fasta_path, fasta_content).expect("Failed to write test FASTA file");
3192
3193 let mut store = RefgetStore::in_memory();
3195 store
3196 .add_sequence_collection_from_fasta(&temp_fasta_path)
3197 .expect("Should parse FASTA headers correctly");
3198
3199 assert_eq!(store.sequence_store.len(), 2);
3201
3202 let name1 = "JAHKSE010000016.1";
3204 let name2 = "JAHKSE010000012.1";
3205
3206 let collections: Vec<_> = store.collections.keys().cloned().collect();
3208 assert_eq!(collections.len(), 1);
3209 let collection_digest = collections[0];
3210
3211 {
3214 let seq1 = store.get_sequence_by_name(&collection_digest, name1);
3215 assert!(
3216 seq1.is_ok(),
3217 "Should retrieve sequence by name (first word)"
3218 );
3219
3220 let seq1_meta = seq1.unwrap().metadata();
3221 assert_eq!(seq1_meta.name, "JAHKSE010000016.1");
3222 assert_eq!(
3223 seq1_meta.description,
3224 Some(
3225 "unmasked:primary_assembly HG002.alt.pat.f1_v2:JAHKSE010000016.1:1:100:1"
3226 .to_string()
3227 )
3228 );
3229 }
3230
3231 {
3232 let seq2 = store.get_sequence_by_name(&collection_digest, name2);
3233 assert!(
3234 seq2.is_ok(),
3235 "Should retrieve sequence by name (first word)"
3236 );
3237 }
3238
3239 println!("✓ FASTA header parsing test passed");
3240 }
3241
3242 #[test]
3243 fn test_rgsi_filename_with_dots() {
3244 let temp_dir = tempdir().expect("Failed to create temporary directory");
3249 let temp_path = temp_dir.path();
3250
3251 let test_file = "../tests/data/fasta/HG002.alt.pat.f1_v2.unmasked.fa";
3253 let temp_fasta = temp_path.join("HG002.alt.pat.f1_v2.unmasked.fa");
3254 fs::copy(test_file, &temp_fasta).expect("Failed to copy test file");
3255
3256 let mut store = RefgetStore::in_memory();
3258 store
3259 .add_sequence_collection_from_fasta(&temp_fasta)
3260 .expect("Should load FASTA");
3261
3262 let correct_rgsi = temp_path.join("HG002.alt.pat.f1_v2.unmasked.rgsi");
3264 let wrong_rgsi = temp_path.join("HG002.rgsi");
3265
3266 let files: Vec<_> = std::fs::read_dir(temp_path)
3267 .unwrap()
3268 .map(|e| e.unwrap().file_name().to_string_lossy().to_string())
3269 .collect();
3270
3271 assert!(
3272 correct_rgsi.exists(),
3273 "Expected 'HG002.alt.pat.f1_v2.unmasked.rgsi' but found: {:?}",
3274 files
3275 );
3276
3277 assert!(
3278 !wrong_rgsi.exists(),
3279 "Should NOT create 'HG002.rgsi' (strips too many dots)"
3280 );
3281
3282 println!("✓ RGSI filename with dots test passed");
3283 }
3284
3285 #[test]
3286 fn test_on_disk_collection_written_incrementally() {
3287 let temp_dir = tempdir().unwrap();
3290 let temp_path = temp_dir.path();
3291 let temp_fasta = temp_path.join("base.fa.gz");
3292 std::fs::copy("../tests/data/fasta/base.fa.gz", &temp_fasta)
3293 .expect("Failed to copy base.fa.gz to tempdir");
3294
3295 let cache_path = temp_path.join("cache");
3296 let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3297
3298 store
3300 .add_sequence_collection_from_fasta(&temp_fasta)
3301 .unwrap();
3302
3303 let collections_dir = cache_path.join("collections");
3305 assert!(
3306 collections_dir.exists(),
3307 "Collections directory should exist"
3308 );
3309
3310 let rgsi_files: Vec<_> = std::fs::read_dir(&collections_dir)
3311 .unwrap()
3312 .map(|e| e.unwrap().file_name().to_string_lossy().to_string())
3313 .collect();
3314
3315 assert!(
3316 !rgsi_files.is_empty(),
3317 "Collection RGSI files should be written incrementally, found: {:?}",
3318 rgsi_files
3319 );
3320 assert!(
3321 rgsi_files.iter().any(|f| f.ends_with(".rgsi")),
3322 "Should have .rgsi files"
3323 );
3324
3325 println!("✓ On-disk collection incremental write test passed");
3326 }
3327
3328 #[test]
3329 fn test_disk_size_calculation() {
3330 let mut store = RefgetStore::in_memory();
3331 store
3332 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3333 .unwrap();
3334
3335 let disk_size = store.total_disk_size();
3336 assert!(disk_size > 0, "Disk size should be greater than 0");
3337
3338 let manual: usize = store
3340 .list_sequences()
3341 .iter()
3342 .map(|m| (m.length * m.alphabet.bits_per_symbol()).div_ceil(8))
3343 .sum();
3344 assert_eq!(disk_size, manual);
3345 }
3346
3347 #[test]
3348 fn test_incremental_index_writing() {
3349 let temp_dir = tempdir().unwrap();
3350 let cache_path = temp_dir.path().join("store");
3351 let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3352
3353 store
3354 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3355 .unwrap();
3356
3357 assert!(
3359 cache_path.join("rgstore.json").exists(),
3360 "rgstore.json should exist"
3361 );
3362 assert!(
3363 cache_path.join("sequences.rgsi").exists(),
3364 "sequences.rgsi should exist"
3365 );
3366 assert!(
3367 cache_path.join("collections.rgci").exists(),
3368 "collections.rgci should exist"
3369 );
3370
3371 let _loaded = RefgetStore::on_disk(&cache_path).unwrap();
3373 }
3374
3375 #[test]
3376 fn test_write_method() {
3377 let temp_dir = tempdir().unwrap();
3378 let cache_path = temp_dir.path().join("store");
3379 let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3380
3381 store
3382 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3383 .unwrap();
3384 store.write().unwrap(); assert!(cache_path.join("rgstore.json").exists());
3387 }
3388
3389 #[test]
3390 fn test_on_disk_smart_constructor() {
3391 let temp_dir = tempdir().unwrap();
3392 let cache_path = temp_dir.path().join("store");
3393
3394 let mut store1 = RefgetStore::on_disk(&cache_path).unwrap();
3396 assert_eq!(store1.mode, StorageMode::Encoded);
3397 store1
3398 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3399 .unwrap();
3400
3401 let store2 = RefgetStore::on_disk(&cache_path).unwrap();
3403 assert_eq!(store2.sequence_store.len(), store1.sequence_store.len());
3404 assert_eq!(
3405 store2.mode,
3406 StorageMode::Encoded,
3407 "Loaded store should preserve Encoded mode"
3408 );
3409
3410 let cache_path_raw = temp_dir.path().join("store_raw");
3412 let mut store3 = RefgetStore::on_disk(&cache_path_raw).unwrap();
3413 store3.disable_encoding(); assert_eq!(store3.mode, StorageMode::Raw);
3415 store3
3416 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3417 .unwrap();
3418
3419 let store4 = RefgetStore::on_disk(&cache_path_raw).unwrap();
3421 assert_eq!(
3422 store4.mode,
3423 StorageMode::Raw,
3424 "Loaded store should preserve Raw mode"
3425 );
3426
3427 let index_path = cache_path_raw.join("rgstore.json");
3429 let json = fs::read_to_string(&index_path).unwrap();
3430 assert!(
3431 json.contains("\"mode\":\"Raw\"") || json.contains("\"mode\": \"Raw\""),
3432 "rgstore.json should contain mode: Raw"
3433 );
3434 }
3435
3436 #[test]
3437 fn test_collection_metadata_methods() {
3438 let temp_dir = tempdir().unwrap();
3440 let cache_path = temp_dir.path().join("store");
3441 let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3442
3443 store
3445 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3446 .unwrap();
3447
3448 let collections = store.list_collections();
3450 assert_eq!(collections.len(), 1, "Should have 1 collection");
3451 let digest = collections[0].digest.clone();
3452
3453 let meta = store.get_collection_metadata(&digest);
3455 assert!(meta.is_some(), "Should get collection metadata");
3456 let meta = meta.unwrap();
3457 assert_eq!(meta.n_sequences, 3, "Collection should have 3 sequences");
3458
3459 assert!(
3461 store.is_collection_loaded(&digest),
3462 "Collection should be loaded (Full)"
3463 );
3464
3465 let stats = store.stats_extended();
3467 assert_eq!(stats.n_collections, 1, "Should have 1 collection total");
3468 assert_eq!(
3469 stats.n_collections_loaded, 1,
3470 "Should have 1 collection loaded"
3471 );
3472 assert_eq!(stats.n_sequences, 3, "Should have 3 sequences");
3473
3474 println!("✓ Collection metadata methods test passed");
3475 }
3476
3477 #[test]
3478 fn test_collection_stub_lazy_loading() {
3479 let temp_dir = tempdir().unwrap();
3481 let cache_path = temp_dir.path().join("store");
3482
3483 let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3485 store
3486 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3487 .unwrap();
3488 let digest = store.list_collections()[0].digest.clone();
3489
3490 drop(store);
3492 let mut loaded_store = RefgetStore::open_local(&cache_path).unwrap();
3493
3494 let meta = loaded_store.get_collection_metadata(&digest);
3496 assert!(meta.is_some(), "Metadata should be available for Stub");
3497 assert_eq!(
3498 meta.unwrap().n_sequences,
3499 3,
3500 "Stub should know sequence count"
3501 );
3502
3503 assert!(
3505 !loaded_store.is_collection_loaded(&digest),
3506 "Collection should be Stub after loading from disk"
3507 );
3508
3509 let stats_before = loaded_store.stats_extended();
3511 assert_eq!(
3512 stats_before.n_collections, 1,
3513 "Should have 1 collection total"
3514 );
3515 assert_eq!(
3516 stats_before.n_collections_loaded, 0,
3517 "Should have 0 collections loaded initially"
3518 );
3519
3520 let seq = loaded_store.get_sequence_by_name(&digest, "chr1");
3522 assert!(
3523 seq.is_ok(),
3524 "Should be able to retrieve sequence after lazy load"
3525 );
3526 assert_eq!(seq.unwrap().metadata().name, "chr1");
3527
3528 assert!(
3530 loaded_store.is_collection_loaded(&digest),
3531 "Collection should be Full after accessing a sequence"
3532 );
3533
3534 let stats_after = loaded_store.stats_extended();
3536 assert_eq!(
3537 stats_after.n_collections_loaded, 1,
3538 "Should have 1 collection loaded after access"
3539 );
3540
3541 println!("✓ Collection stub lazy loading test passed");
3542 }
3543
3544 #[test]
3547 fn test_get_collection() {
3548 let temp_dir = tempdir().unwrap();
3550 let cache_path = temp_dir.path().join("store");
3551
3552 let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3554 store
3555 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3556 .unwrap();
3557 let digest = store.list_collections()[0].digest.clone();
3558 drop(store);
3559
3560 let mut loaded_store = RefgetStore::open_local(&cache_path).unwrap();
3562
3563 assert!(!loaded_store.is_collection_loaded(&digest));
3565
3566 let stats_before = loaded_store.stats_extended();
3568 assert_eq!(
3569 stats_before.n_sequences_loaded, 0,
3570 "No sequences should be loaded initially"
3571 );
3572
3573 let collection = loaded_store.get_collection(&digest).unwrap();
3575 assert!(
3576 !collection.sequences.is_empty(),
3577 "Collection should have sequences"
3578 );
3579 assert_eq!(collection.sequences.len(), 3);
3580
3581 let stats_after = loaded_store.stats_extended();
3583 assert_eq!(
3584 stats_after.n_sequences_loaded, 0,
3585 "Sequences not loaded until explicitly fetched"
3586 );
3587 assert_eq!(
3588 stats_after.n_collections_loaded, 1,
3589 "Collection should be loaded"
3590 );
3591
3592 for record in loaded_store.sequence_store.values() {
3594 assert!(
3595 !record.is_loaded(),
3596 "Sequences should be stubs after get_collection"
3597 );
3598 }
3599
3600 let seq_digest = collection.sequences[0].metadata().sha512t24u.clone();
3602 let loaded_seq = loaded_store.get_sequence(&seq_digest).unwrap();
3603 assert!(
3604 loaded_seq.is_loaded(),
3605 "Sequence should be loaded after get_sequence"
3606 );
3607
3608 println!("✓ get_collection test passed");
3609 }
3610
3611 #[test]
3612 fn test_get_sequence() {
3613 let temp_dir = tempdir().unwrap();
3615 let cache_path = temp_dir.path().join("store");
3616
3617 let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3619 store
3620 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3621 .unwrap();
3622
3623 let seq_digest = store
3625 .sequence_store
3626 .values()
3627 .next()
3628 .unwrap()
3629 .metadata()
3630 .sha512t24u
3631 .clone();
3632 drop(store);
3633
3634 let mut loaded_store = RefgetStore::open_local(&cache_path).unwrap();
3636
3637 let seq_before = loaded_store
3639 .sequence_store
3640 .get(&seq_digest.to_key())
3641 .unwrap();
3642 assert!(
3643 !seq_before.is_loaded(),
3644 "Sequence should not have data before get_sequence"
3645 );
3646
3647 let loaded_seq = loaded_store.get_sequence(&seq_digest).unwrap();
3649 assert!(
3650 loaded_seq.is_loaded(),
3651 "Sequence should have data after get_sequence"
3652 );
3653 assert!(
3654 loaded_seq.sequence().is_some(),
3655 "Sequence data should be available"
3656 );
3657
3658 println!("✓ get_sequence test passed");
3659 }
3660
3661 #[test]
3662 fn test_get_collection_idempotent() {
3663 let temp_dir = tempdir().unwrap();
3665 let cache_path = temp_dir.path().join("store");
3666
3667 let mut store = RefgetStore::on_disk(&cache_path).unwrap();
3669 store
3670 .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa.gz")
3671 .unwrap();
3672 let digest = store.list_collections()[0].digest.clone();
3673 drop(store);
3674
3675 let mut loaded_store = RefgetStore::open_local(&cache_path).unwrap();
3677
3678 let result1 = loaded_store.get_collection(&digest);
3680 assert!(result1.is_ok(), "First get should succeed");
3681
3682 let result2 = loaded_store.get_collection(&digest);
3683 assert!(result2.is_ok(), "Second get should also succeed");
3684
3685 assert_eq!(loaded_store.stats_extended().n_collections_loaded, 1);
3687
3688 println!("✓ get_collection idempotent test passed");
3689 }
3690
3691 #[test]
3692 fn test_sanitize_relative_path_rejects_traversal() {
3693 assert!(RefgetStore::sanitize_relative_path("../etc/passwd").is_err());
3694 assert!(RefgetStore::sanitize_relative_path("foo/../bar").is_err());
3695 assert!(RefgetStore::sanitize_relative_path("foo/../../bar").is_err());
3696 assert!(RefgetStore::sanitize_relative_path("..").is_err());
3697 }
3698
3699 #[test]
3700 fn test_sanitize_relative_path_rejects_absolute() {
3701 assert!(RefgetStore::sanitize_relative_path("/etc/passwd").is_err());
3702 assert!(RefgetStore::sanitize_relative_path("\\windows\\system32").is_err());
3703 }
3704
3705 #[test]
3706 fn test_sanitize_relative_path_accepts_valid() {
3707 assert!(RefgetStore::sanitize_relative_path("sequences/ab/abc123.seq").is_ok());
3708 assert!(RefgetStore::sanitize_relative_path("collections/xyz.rgsi").is_ok());
3709 assert!(RefgetStore::sanitize_relative_path("rgstore.json").is_ok());
3710 assert!(RefgetStore::sanitize_relative_path("sequences/%s2/%s.seq").is_ok());
3711 }
3712
3713 #[test]
3714 fn test_stale_rgsi_cache_is_ignored() {
3715 use std::io::Write;
3718
3719 let temp_dir = tempdir().unwrap();
3720
3721 let fasta_path = temp_dir.path().join("test.fa");
3723 let mut fasta_file = fs::File::create(&fasta_path).unwrap();
3724 writeln!(fasta_file, ">chr1\nATGCATGC\n>chr2\nGGGGAAAA").unwrap();
3725
3726 let rgsi_path = temp_dir.path().join("test.rgsi");
3728 let mut rgsi_file = fs::File::create(&rgsi_path).unwrap();
3729 writeln!(
3730 rgsi_file,
3731 "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
3732 )
3733 .unwrap();
3734
3735 let store_path = temp_dir.path().join("store");
3737 let mut store = RefgetStore::on_disk(&store_path).unwrap();
3738
3739 let result = store.add_sequence_collection_from_fasta(&fasta_path);
3742 assert!(
3743 result.is_ok(),
3744 "Should handle stale cache: {:?}",
3745 result.err()
3746 );
3747
3748 assert_eq!(store.sequence_store.len(), 2, "Should have 2 sequences");
3750
3751 println!("✓ Stale RGSI cache test passed");
3752 }
3753}