Skip to main content

gtars_refget/store/
mod.rs

1//! # RefgetStore
2//!
3//! A store for managing reference genome sequences with support for both
4//! in-memory and disk-backed storage.
5//!
6//! ## Two-Type Design
7//!
8//! - **`RefgetStore`** (wrapper): User-facing type with `&mut self` read methods
9//!   that automatically lazy-load data on first access. Use for CLI and scripts.
10//! - **`ReadonlyRefgetStore`** (inner): All reads are `&self`. Suitable for
11//!   `Arc<ReadonlyRefgetStore>` in servers. Requires explicit preloading.
12
13mod readonly;
14mod core;
15mod alias;
16mod fhr_metadata;
17mod import;
18mod persistence;
19mod export;
20
21#[cfg(test)]
22mod tests;
23
24// Re-export public types from submodules
25pub use self::readonly::ReadonlyRefgetStore;
26pub use self::core::RefgetStore;
27pub use self::alias::{AliasKind, AliasManager};
28pub use self::fhr_metadata::{
29    FhrMetadata, FhrAuthor, FhrIdentifier, FhrTaxon, FhrVitalStats,
30    // Disk I/O helpers used by persistence and externally
31    load_sidecars, write_sidecars, write_sidecar, remove_sidecar, sidecar_path, load_from_json,
32};
33
34use serde::{Deserialize, Serialize};
35use std::io::{BufRead, BufReader, Read};
36
37pub(crate) use crate::hashkeyable::DigestKey;
38
39
40// =========================================================================
41// Shared constants
42// =========================================================================
43
/// Default template for sequence data file paths.
///
/// NOTE(review): the `%s2` and `%s` placeholders are expanded by code outside
/// this file; presumably they are filled from the sequence digest — confirm
/// against the path-expansion helper before changing this value.
pub(crate) const DEFAULT_SEQDATA_PATH_TEMPLATE: &str = "sequences/%s2/%s.seq";
45
46// =========================================================================
47// Shared types used across multiple submodules
48// =========================================================================
49
50/// Paginated result container matching the seqcol spec response format.
51#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct PagedResult<T> {
53    pub results: Vec<T>,
54    pub pagination: Pagination,
55}
56
57#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct Pagination {
59    pub page: usize,
60    pub page_size: usize,
61    pub total: usize,
62}
63
64/// Enum storing whether sequences will be stored in Raw or Encoded form
65#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq)]
66pub enum StorageMode {
67    Raw,
68    Encoded,
69}
70
/// Sequence text together with the chromosome name and coordinates that
/// describe it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct RetrievedSequence {
    /// The sequence text.
    pub sequence: String,
    /// Name of the chromosome the sequence belongs to.
    pub chrom_name: String,
    /// Start coordinate.
    /// NOTE(review): coordinate convention (0-based / half-open?) is not
    /// visible here — confirm against the code that builds this value.
    pub start: u32,
    /// End coordinate (see the note on `start` regarding the convention).
    pub end: u32,
}
78
/// Options for importing a FASTA file into a RefgetStore.
///
/// Build a value with [`FastaImportOptions::new`] (or `Default::default()`)
/// and chain the setters, e.g.
/// `FastaImportOptions::new().force(true).namespaces(&["ncbi"])`.
///
/// The default configuration is `force = false` with an empty namespace list.
/// `Default` is derived: `bool` defaults to `false` and `&[T]` defaults to the
/// empty slice, which is exactly what the previous hand-written impl returned.
#[derive(Debug, Clone, Copy, Default)]
pub struct FastaImportOptions<'a> {
    /// Value set through [`FastaImportOptions::force`].
    /// NOTE(review): presumably requests re-import/overwrite of existing
    /// data — confirm in the import module, which consumes this flag.
    pub(crate) force: bool,
    /// Namespace names set through [`FastaImportOptions::namespaces`].
    pub(crate) namespaces: &'a [&'a str],
}

impl<'a> FastaImportOptions<'a> {
    /// Creates options with the default settings (`force = false`, no namespaces).
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the options with the `force` flag set to `yes`.
    #[must_use]
    pub fn force(mut self, yes: bool) -> Self {
        self.force = yes;
        self
    }

    /// Returns the options with the namespace list replaced by `ns`.
    ///
    /// The slice is borrowed, not copied, so it must outlive the options.
    #[must_use]
    pub fn namespaces(mut self, ns: &'a [&'a str]) -> Self {
        self.namespaces = ns;
        self
    }
}
113
114/// Metadata for the entire store.
115/// This is used to serialize metadata to `rgstore.json`, which can be loaded by the application.
116#[derive(Serialize, Deserialize, Debug)]
117pub(crate) struct StoreMetadata {
118    /// Version of the metadata format
119    pub(crate) version: u32,
120    /// Template for sequence file paths
121    pub(crate) seqdata_path_template: String,
122    /// Template for collection file paths
123    pub(crate) collections_path_template: String,
124    /// Path to the sequence metadata index file
125    pub(crate) sequence_index: String,
126    /// Path to the collection metadata index file (NEW)
127    #[serde(default)]
128    pub(crate) collection_index: Option<String>,
129    /// Storage mode (Raw or Encoded)
130    pub(crate) mode: StorageMode,
131    /// Creation timestamp
132    pub(crate) created_at: String,
133    /// Whether ancillary digests are computed and stored
134    #[serde(default = "default_true")]
135    pub(crate) ancillary_digests: bool,
136    /// Whether on-disk attribute index is maintained (Part 2)
137    #[serde(default)]
138    pub(crate) attribute_index: bool,
139    /// Available sequence alias namespaces (for remote discovery)
140    #[serde(default, skip_serializing_if = "Vec::is_empty")]
141    pub(crate) sequence_alias_namespaces: Vec<String>,
142    /// Available collection alias namespaces (for remote discovery)
143    #[serde(default, skip_serializing_if = "Vec::is_empty")]
144    pub(crate) collection_alias_namespaces: Vec<String>,
145    /// Last-modified timestamp (RFC 3339). Updated on every write_index_files().
146    #[serde(default, skip_serializing_if = "Option::is_none")]
147    pub(crate) modified: Option<String>,
148    /// SHA256 digest of collections.rgci
149    #[serde(default, skip_serializing_if = "Option::is_none")]
150    pub(crate) collections_digest: Option<String>,
151    /// SHA256 digest of sequences.rgsi
152    #[serde(default, skip_serializing_if = "Option::is_none")]
153    pub(crate) sequences_digest: Option<String>,
154    /// SHA256 digest of combined alias data
155    #[serde(default, skip_serializing_if = "Option::is_none")]
156    pub(crate) aliases_digest: Option<String>,
157    /// SHA256 digest of combined FHR sidecar data
158    #[serde(default, skip_serializing_if = "Option::is_none")]
159    pub(crate) fhr_digest: Option<String>,
160}
161
/// Always returns `true`.
///
/// Named by `#[serde(default = "default_true")]` so that a boolean field
/// missing from the input deserializes as `true` rather than `false`.
pub(crate) fn default_true() -> bool {
    true
}
165
/// Summary counts describing a RefgetStore.
#[derive(Clone, Debug)]
pub struct StoreStats {
    /// Count of all sequences, Stub and Full alike.
    pub n_sequences: usize,
    /// Count of sequences whose data is loaded (Full).
    pub n_sequences_loaded: usize,
    /// Count of all collections, Stub and Full alike.
    pub n_collections: usize,
    /// Count of collections whose sequences are loaded (Full).
    pub n_collections_loaded: usize,
    /// Storage mode (Raw or Encoded), held as text.
    pub storage_mode: String,
}
180
/// Formats a byte count as a human-readable size string.
///
/// Counts below 1024 are printed exactly, as `"<n> B"`. Larger counts are
/// scaled by powers of 1024 and printed with two decimals using `KB`, `MB`,
/// `GB` or `TB`. `TB` is the largest unit, so anything bigger stays in `TB`.
///
/// The unit is picked from the value *as it will be displayed*: a count that
/// would round up to `1024.00` of one unit is shown as `1.00` of the next
/// (e.g. `1_048_575` bytes yields `"1.00 MB"`, not `"1024.00 KB"`).
pub(crate) fn format_bytes(bytes: usize) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];
    const STEP: f64 = 1024.0;

    let largest_unit = UNITS.len() - 1;
    let mut size = bytes as f64;
    let mut unit_idx = 0;

    // Compare against the two-decimal rounding that `{:.2}` will print rather
    // than the raw value; otherwise 1023.999… would be rendered as "1024.00"
    // of the smaller unit. For whole byte counts (unit_idx == 0) the rounding
    // leaves the value unchanged, so the B -> KB threshold is still 1024.
    while unit_idx < largest_unit && (size * 100.0).round() / 100.0 >= STEP {
        size /= STEP;
        unit_idx += 1;
    }

    if unit_idx == 0 {
        // Print the original integer so plain byte counts carry no decimals.
        format!("{} {}", bytes, UNITS[0])
    } else {
        format!("{:.2} {}", size, UNITS[unit_idx])
    }
}
198
199// =========================================================================
200// Sidecar sync types
201// =========================================================================
202
/// How a sidecar pull resolves a conflict between local and remote files.
///
/// NOTE(review): `KeepOurs` is described as the default, but no `Default`
/// impl is visible in this file — presumably callers pass it explicitly.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SyncStrategy {
    /// Local wins: the fetch is skipped when a local file exists (default).
    KeepOurs,
    /// Remote wins: always fetch, overwriting the local file.
    KeepTheirs,
    /// Fetch nothing; only report what would change, returning diff info.
    Notify,
}
213
/// Tallies produced by a sidecar pull operation.
///
/// `Default` yields all-zero counters and an empty conflict list.
#[derive(Default, Debug)]
pub struct PullResult {
    /// Count of sidecars successfully fetched from the remote.
    pub pulled: usize,
    /// Count skipped because a local file exists (KeepOurs).
    pub skipped: usize,
    /// Count for which the remote answered 404 (no sidecar exists).
    pub not_found: usize,
    /// Notify only: paths whose local and remote versions differ.
    pub conflicts: Vec<String>,
}
226
/// Borrowed view of the alias namespaces a store's manifest advertises.
#[derive(Debug)]
pub struct AvailableAliases<'a> {
    /// Namespaces advertised for sequence aliases.
    pub sequences: &'a [String],
    /// Namespaces advertised for collection aliases.
    pub collections: &'a [String],
}
233
234/// Iterator over BED file regions yielding substrings from a store.
235pub struct SubstringsFromRegions<'a, K>
236where
237    K: AsRef<[u8]>,
238{
239    pub(crate) store: &'a ReadonlyRefgetStore,
240    pub(crate) reader: BufReader<Box<dyn Read>>,
241    pub(crate) collection_digest: K,
242    pub(crate) previous_parsed_chr: String,
243    pub(crate) current_seq_digest: String,
244    pub(crate) line_num: usize,
245}