Skip to main content

gtars_refget/store/
persistence.rs

1//! Disk I/O for RefgetStore: reading and writing index files,
2//! opening stores from disk, and loading sequences/collections.
3
4use super::*;
5use super::readonly::ReadonlyRefgetStore;
6use super::fhr_metadata;
7
8use std::collections::HashMap;
9use std::ffi::OsStr;
10
11use indexmap::IndexMap;
12use std::fs::{self, File, create_dir_all};
13use std::io::{BufRead, Write};
14use std::path::Path;
15
16use anyhow::{Context, Result};
17use sha2::{Sha256, Digest};
18
19use crate::collection::{
20    SequenceCollectionRecordExt,
21    read_rgsi_file,
22};
23use crate::digest::{
24    SequenceCollectionRecord, SequenceMetadata, SequenceRecord,
25    parse_rgci_line, parse_rgsi_line,
26};
27use crate::hashkeyable::HashKeyable;
28
29use chrono::Utc;
30
31// ============================================================================
32// ReadonlyRefgetStore disk I/O methods
33// ============================================================================
34
35impl ReadonlyRefgetStore {
36    /// Write a single sequence to disk using the configured path template
37    pub(crate) fn write_sequence_to_disk_single(
38        &self,
39        metadata: &SequenceMetadata,
40        sequence: &[u8],
41    ) -> Result<()> {
42        let template = self
43            .seqdata_path_template
44            .as_ref()
45            .context("seqdata_path_template not set")?;
46        let local_path = self.local_path.as_ref().context("local_path not set")?;
47
48        let seq_file_path = Self::expand_template(&metadata.sha512t24u, template);
49        let full_path = local_path.join(&seq_file_path);
50
51        if let Some(parent) = full_path.parent() {
52            create_dir_all(parent)?;
53        }
54
55        let mut file = File::create(&full_path)?;
56        file.write_all(sequence)?;
57
58        Ok(())
59    }
60
61    /// Write a single collection RGSI file to disk.
62    /// Used when persist_to_disk=true to persist collections incrementally.
63    pub(crate) fn write_collection_to_disk_single(&self, record: &SequenceCollectionRecord) -> Result<()> {
64        let local_path = self.local_path.as_ref().context("local_path not set")?;
65
66        let coll_file_path = format!("collections/{}.rgsi", record.metadata().digest);
67        let full_path = local_path.join(&coll_file_path);
68
69        if let Some(parent) = full_path.parent() {
70            create_dir_all(parent)?;
71        }
72
73        record.write_collection_rgsi(&full_path)?;
74
75        Ok(())
76    }
77
78    /// Write index files (sequences.rgsi, collections.rgci, and rgstore.json) to disk.
79    ///
80    /// This allows the store to be loaded later via open_local().
81    /// Called automatically when adding collections in disk-backed mode.
82    pub(crate) fn write_index_files(&self) -> Result<()> {
83        let local_path = self.local_path.as_ref().context("local_path not set")?;
84        let template = self
85            .seqdata_path_template
86            .as_ref()
87            .context("seqdata_path_template not set")?;
88
89        let sequence_index_path = local_path.join("sequences.rgsi");
90        self.write_sequences_rgsi(&sequence_index_path)?;
91
92        let collection_index_path = local_path.join("collections.rgci");
93        self.write_collections_rgci(&collection_index_path)?;
94
95        // Compute digests of the files we just wrote
96        let sequences_digest = Self::sha256_file(&sequence_index_path).ok();
97        let collections_digest = Self::sha256_file(&collection_index_path).ok();
98        let aliases_digest = self.compute_aliases_digest();
99        let fhr_digest = self.compute_fhr_digest();
100
101        self.write_rgstore_json_with_digests(
102            local_path, template,
103            collections_digest, sequences_digest,
104            aliases_digest, fhr_digest,
105        )?;
106
107        Ok(())
108    }
109
110    /// Write the rgstore.json metadata file to the given directory.
111    pub(crate) fn write_rgstore_json(&self, dir: &Path, seqdata_template: &str) -> Result<()> {
112        self.write_rgstore_json_with_digests(dir, seqdata_template, None, None, None, None)
113    }
114
115    /// Write the rgstore.json metadata file with state digests and modified timestamp.
116    pub(crate) fn write_rgstore_json_with_digests(
117        &self,
118        dir: &Path,
119        seqdata_template: &str,
120        collections_digest: Option<String>,
121        sequences_digest: Option<String>,
122        aliases_digest: Option<String>,
123        fhr_digest: Option<String>,
124    ) -> Result<()> {
125        let metadata = StoreMetadata {
126            version: 1,
127            seqdata_path_template: seqdata_template.to_string(),
128            collections_path_template: "collections/%s.rgsi".to_string(),
129            sequence_index: "sequences.rgsi".to_string(),
130            collection_index: Some("collections.rgci".to_string()),
131            mode: self.mode,
132            created_at: Utc::now().to_rfc3339(),
133            ancillary_digests: self.ancillary_digests,
134            attribute_index: self.attribute_index,
135            sequence_alias_namespaces: self.aliases.sequence_namespaces(),
136            collection_alias_namespaces: self.aliases.collection_namespaces(),
137            modified: Some(Utc::now().to_rfc3339()),
138            collections_digest,
139            sequences_digest,
140            aliases_digest,
141            fhr_digest,
142        };
143
144        let json = serde_json::to_string_pretty(&metadata)
145            .context("Failed to serialize metadata to JSON")?;
146        fs::write(dir.join("rgstore.json"), json).context("Failed to write rgstore.json")?;
147
148        Ok(())
149    }
150
151    /// Write collection metadata index (collections.rgci) to disk.
152    ///
153    /// Creates a master index of all collections with their metadata.
154    pub(crate) fn write_collections_rgci<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
155        let file_path = file_path.as_ref();
156        let mut file = File::create(file_path)?;
157
158        writeln!(
159            file,
160            "#digest\tn_sequences\tnames_digest\tsequences_digest\tlengths_digest\tname_length_pairs_digest\tsorted_name_length_pairs_digest\tsorted_sequences_digest"
161        )?;
162
163        for record in self.collections.values() {
164            let meta = record.metadata();
165            writeln!(
166                file,
167                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
168                meta.digest,
169                meta.n_sequences,
170                meta.names_digest,
171                meta.sequences_digest,
172                meta.lengths_digest,
173                meta.name_length_pairs_digest.as_deref().unwrap_or(""),
174                meta.sorted_name_length_pairs_digest.as_deref().unwrap_or(""),
175                meta.sorted_sequences_digest.as_deref().unwrap_or(""),
176            )?;
177        }
178        Ok(())
179    }
180
181    /// Write all sequence metadata to an RGSI file.
182    pub fn write_sequences_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
183        let file_path = file_path.as_ref();
184        let mut file = std::fs::File::create(file_path)?;
185
186        writeln!(
187            file,
188            "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
189        )?;
190
191        for result_sr in self.sequence_store.values() {
192            let result = result_sr.metadata();
193            let description = result.description.as_deref().unwrap_or("");
194            writeln!(
195                file,
196                "{}\t{}\t{}\t{}\t{}\t{}",
197                result.name,
198                result.length,
199                result.alphabet,
200                result.sha512t24u,
201                result.md5,
202                description
203            )?;
204        }
205        Ok(())
206    }
207
208    /// Read the store metadata from rgstore.json, returning state digests and timestamp.
209    ///
210    /// Returns a HashMap with keys: modified, collections_digest, sequences_digest,
211    /// aliases_digest, fhr_digest. Missing values are omitted from the map.
212    pub fn store_metadata(&self) -> Result<HashMap<String, String>> {
213        let local_path = self.local_path.as_ref().context("local_path not set")?;
214        let json = fs::read_to_string(local_path.join("rgstore.json"))
215            .context("Failed to read rgstore.json")?;
216        let metadata: StoreMetadata =
217            serde_json::from_str(&json).context("Failed to parse store metadata")?;
218
219        let mut map = HashMap::new();
220        if let Some(v) = metadata.modified { map.insert("modified".to_string(), v); }
221        if let Some(v) = metadata.collections_digest { map.insert("collections_digest".to_string(), v); }
222        if let Some(v) = metadata.sequences_digest { map.insert("sequences_digest".to_string(), v); }
223        if let Some(v) = metadata.aliases_digest { map.insert("aliases_digest".to_string(), v); }
224        if let Some(v) = metadata.fhr_digest { map.insert("fhr_digest".to_string(), v); }
225        Ok(map)
226    }
227
228    /// Compute SHA256 digest of a file's contents.
229    fn sha256_file(path: &Path) -> Result<String> {
230        let bytes = fs::read(path)?;
231        let hash = Sha256::digest(&bytes);
232        Ok(format!("{:x}", hash))
233    }
234
235    /// Compute a combined SHA256 digest of all alias namespace files (sorted).
236    /// Alias files live under aliases/sequences/*.tsv and aliases/collections/*.tsv.
237    fn compute_aliases_digest(&self) -> Option<String> {
238        let local_path = self.local_path.as_ref()?;
239        let aliases_dir = local_path.join("aliases");
240        if !aliases_dir.exists() { return None; }
241
242        let mut paths: Vec<_> = Vec::new();
243        for subdir in &["sequences", "collections"] {
244            let sub = aliases_dir.join(subdir);
245            if sub.exists() {
246                if let Ok(entries) = fs::read_dir(&sub) {
247                    for entry in entries.filter_map(|e| e.ok()) {
248                        let p = entry.path();
249                        if p.extension().map_or(false, |e| e == "tsv") {
250                            paths.push(p);
251                        }
252                    }
253                }
254            }
255        }
256        if paths.is_empty() { return None; }
257        paths.sort();
258
259        let mut hasher = Sha256::new();
260        for path in &paths {
261            hasher.update(fs::read(path).ok()?);
262        }
263        Some(format!("{:x}", hasher.finalize()))
264    }
265
266    /// Compute a combined SHA256 digest of all FHR sidecar files (sorted).
267    fn compute_fhr_digest(&self) -> Option<String> {
268        let local_path = self.local_path.as_ref()?;
269        let fhr_dir = local_path.join("fhr");
270        if !fhr_dir.exists() { return None; }
271
272        let mut paths: Vec<_> = fs::read_dir(&fhr_dir).ok()?
273            .filter_map(|e| e.ok())
274            .map(|e| e.path())
275            .collect();
276        if paths.is_empty() { return None; }
277        paths.sort();
278
279        let mut hasher = Sha256::new();
280        for path in paths {
281            hasher.update(fs::read(&path).ok()?);
282        }
283        Some(format!("{:x}", hasher.finalize()))
284    }
285
286    // =========================================================================
287    // Open methods
288    // =========================================================================
289
290    /// Open a local store (internal). Users should use RefgetStore::open_local().
291    pub(crate) fn open_local<P: AsRef<Path>>(path: P) -> Result<Self> {
292        let root_path = path.as_ref();
293
294        let index_path = root_path.join("rgstore.json");
295        let json = fs::read_to_string(&index_path).context(format!(
296            "Failed to read rgstore.json from {}",
297            index_path.display()
298        ))?;
299
300        let metadata: StoreMetadata =
301            serde_json::from_str(&json).context("Failed to parse store metadata")?;
302
303        Self::sanitize_relative_path(&metadata.seqdata_path_template)?;
304        Self::sanitize_relative_path(&metadata.sequence_index)?;
305        if let Some(ref ci) = metadata.collection_index {
306            Self::sanitize_relative_path(ci)?;
307        }
308
309        let mut store = ReadonlyRefgetStore::new(metadata.mode);
310        store.local_path = Some(root_path.to_path_buf());
311        store.seqdata_path_template = Some(metadata.seqdata_path_template.clone());
312        store.persist_to_disk = true;
313        store.ancillary_digests = metadata.ancillary_digests;
314        store.attribute_index = metadata.attribute_index;
315
316        let sequence_index_path = root_path.join(&metadata.sequence_index);
317        if sequence_index_path.exists() {
318            Self::load_sequences_from_index(&mut store, &sequence_index_path)?;
319        }
320
321        if let Some(ref collection_index) = metadata.collection_index {
322            let collection_index_path = root_path.join(collection_index);
323            if collection_index_path.exists() {
324                Self::load_collection_stubs_from_rgci(&mut store, &collection_index_path)?;
325            }
326        }
327
328        if store.collections.is_empty() {
329            let collections_dir = root_path.join("collections");
330            Self::load_collections_from_directory(&mut store, &collections_dir)?;
331        }
332
333        let aliases_dir = root_path.join("aliases");
334        store.aliases.load_from_dir(&aliases_dir)?;
335
336        store.available_sequence_alias_namespaces = if metadata.sequence_alias_namespaces.is_empty() {
337            store.aliases.sequence_namespaces()
338        } else {
339            metadata.sequence_alias_namespaces
340        };
341        store.available_collection_alias_namespaces = if metadata.collection_alias_namespaces.is_empty() {
342            store.aliases.collection_namespaces()
343        } else {
344            metadata.collection_alias_namespaces
345        };
346
347        store.fhr_metadata =
348            fhr_metadata::load_sidecars(&root_path.join("fhr"));
349
350        Ok(store)
351    }
352
353    /// Open a remote store (internal). Users should use RefgetStore::open_remote().
354    pub(crate) fn open_remote<P: AsRef<Path>, S: AsRef<str>>(
355        cache_path: P,
356        remote_url: S,
357    ) -> Result<Self> {
358        let cache_path = cache_path.as_ref();
359        let remote_url = remote_url.as_ref().to_string();
360
361        create_dir_all(cache_path)?;
362
363        let index_data = Self::fetch_file(
364            &Some(cache_path.to_path_buf()),
365            &Some(remote_url.clone()),
366            "rgstore.json",
367            true,
368            false,
369        )?;
370
371        let json =
372            String::from_utf8(index_data).context("Store metadata contains invalid UTF-8")?;
373
374        let metadata: StoreMetadata =
375            serde_json::from_str(&json).context("Failed to parse store metadata")?;
376
377        Self::sanitize_relative_path(&metadata.seqdata_path_template)?;
378        Self::sanitize_relative_path(&metadata.sequence_index)?;
379        if let Some(ref ci) = metadata.collection_index {
380            Self::sanitize_relative_path(ci)?;
381        }
382
383        let mut store = ReadonlyRefgetStore::new(metadata.mode);
384        store.local_path = Some(cache_path.to_path_buf());
385        store.remote_source = Some(remote_url.clone());
386        store.seqdata_path_template = Some(metadata.seqdata_path_template.clone());
387        store.persist_to_disk = true;
388        store.ancillary_digests = metadata.ancillary_digests;
389        store.attribute_index = metadata.attribute_index;
390        store.available_sequence_alias_namespaces = metadata.sequence_alias_namespaces;
391        store.available_collection_alias_namespaces = metadata.collection_alias_namespaces;
392
393        let sequence_index_data = Self::fetch_file(
394            &Some(cache_path.to_path_buf()),
395            &Some(remote_url.clone()),
396            &metadata.sequence_index,
397            true,
398            false,
399        )?;
400        let sequence_index_str = String::from_utf8(sequence_index_data)
401            .context("sequence index contains invalid UTF-8")?;
402
403        Self::load_sequences_from_reader(&mut store, sequence_index_str.as_bytes())?;
404
405        if let Some(ref collection_index) = metadata.collection_index {
406            if let Ok(collection_index_data) = Self::fetch_file(
407                &Some(cache_path.to_path_buf()),
408                &Some(remote_url.clone()),
409                collection_index,
410                true,
411                false,
412            ) {
413                let collection_index_str = String::from_utf8(collection_index_data)
414                    .context("collection index contains invalid UTF-8")?;
415
416                Self::load_collection_stubs_from_reader(
417                    &mut store,
418                    collection_index_str.as_bytes(),
419                )?;
420            }
421        }
422
423        if store.collections.is_empty() {
424            let local_collections_dir = cache_path.join("collections");
425            create_dir_all(&local_collections_dir)?;
426            Self::load_collections_from_directory(&mut store, &local_collections_dir)?;
427        }
428
429        Ok(store)
430    }
431
432    // =========================================================================
433    // Loading helpers
434    // =========================================================================
435
436    /// Parse RGSI lines from a reader and load as Stub sequence records.
437    pub(crate) fn load_sequences_from_reader<R: BufRead>(store: &mut ReadonlyRefgetStore, reader: R) -> Result<()> {
438        for line in reader.lines() {
439            let line = line?;
440
441            if line.starts_with('#') {
442                continue;
443            }
444
445            if let Some(seq_metadata) = parse_rgsi_line(&line) {
446                let record = SequenceRecord::Stub(seq_metadata.clone());
447
448                let sha512_key = seq_metadata.sha512t24u.to_key();
449                store.sequence_store.insert(sha512_key, record);
450
451                let md5_key = seq_metadata.md5.to_key();
452                store.md5_lookup.insert(md5_key, sha512_key);
453            }
454        }
455
456        Ok(())
457    }
458
459    /// Load sequence metadata from a sequence index file (sequences.rgsi).
460    pub(crate) fn load_sequences_from_index(store: &mut ReadonlyRefgetStore, index_path: &Path) -> Result<()> {
461        let file = std::fs::File::open(index_path)?;
462        let reader = std::io::BufReader::new(file);
463        Self::load_sequences_from_reader(store, reader)
464    }
465
466    /// Parse RGCI lines from a reader and load as Stub collection records.
467    pub(crate) fn load_collection_stubs_from_reader<R: BufRead>(
468        store: &mut ReadonlyRefgetStore,
469        reader: R,
470    ) -> Result<()> {
471        for line in reader.lines() {
472            let line = line?;
473
474            if let Some(metadata) = parse_rgci_line(&line) {
475                let key = metadata.digest.to_key();
476                store
477                    .collections
478                    .insert(key, SequenceCollectionRecord::Stub(metadata));
479            }
480        }
481
482        Ok(())
483    }
484
485    /// Load collection stubs from collections.rgci index file (new format).
486    pub(crate) fn load_collection_stubs_from_rgci(store: &mut ReadonlyRefgetStore, index_path: &Path) -> Result<()> {
487        let file = std::fs::File::open(index_path)?;
488        let reader = std::io::BufReader::new(file);
489        Self::load_collection_stubs_from_reader(store, reader)
490    }
491
492    /// Load full collections from a collections directory (fallback when no RGCI exists).
493    pub(crate) fn load_collections_from_directory(
494        store: &mut ReadonlyRefgetStore,
495        collections_dir: &Path,
496    ) -> Result<()> {
497        if !collections_dir.exists() {
498            return Ok(());
499        }
500
501        for entry in fs::read_dir(collections_dir)? {
502            let entry = entry?;
503            let path = entry.path();
504
505            if path.is_file() && path.extension() == Some(OsStr::new("rgsi")) {
506                let collection = read_rgsi_file(&path)?;
507                let collection_digest = collection.metadata.digest.to_key();
508
509                let mut name_map = IndexMap::new();
510                for sequence_record in &collection.sequences {
511                    let metadata = sequence_record.metadata();
512                    name_map.insert(metadata.name.clone(), metadata.sha512t24u.to_key());
513                }
514                store.name_lookup.insert(collection_digest, name_map);
515
516                let record = SequenceCollectionRecord::from(collection);
517                store.collections.insert(collection_digest, record);
518            }
519        }
520
521        Ok(())
522    }
523}