Skip to main content

refget_store/
lib.rs

1//! Storage traits and implementations for refget sequences and sequence collections.
2
3pub mod fasta;
4mod memory;
5mod mmap;
6mod seqcol_store;
7
8pub use fasta::{DigestCache, FastaSequenceStore, FastaSequenceSummary, SeqColCache, SidecarCache};
9pub use memory::InMemorySequenceStore;
10pub use mmap::MmapSequenceStore;
11pub use seqcol_store::InMemorySeqColStore;
12
13use std::path::{Path, PathBuf};
14
15use refget_model::SequenceMetadata;
16use serde::{Deserialize, Serialize};
17
18/// Errors from store operations.
19#[derive(Debug, thiserror::Error)]
20pub enum StoreError {
21    #[error("I/O error: {0}")]
22    Io(#[from] std::io::Error),
23    #[error("FASTA index error: {0}")]
24    Fasta(String),
25    #[error("Sequence not found: {0}")]
26    NotFound(String),
27}
28
29/// Result type for store operations.
30pub type StoreResult<T> = Result<T, StoreError>;
31
/// Extract a subsequence from `seq` given optional 0-based half-open `start`/`end`.
///
/// A missing `start` defaults to 0 and a missing `end` defaults to the sequence length.
/// Both bounds are clamped to the sequence length. Ranges that start at or past the end
/// of the sequence, and inverted ranges (`start > end`), yield an empty vector rather
/// than panicking.
pub(crate) fn extract_subsequence(seq: &[u8], start: Option<u64>, end: Option<u64>) -> Vec<u8> {
    let len = seq.len();
    // Clamp in the `u64` domain before narrowing: `len as u64` is lossless, and the result
    // is at most `len`, so the cast back to `usize` cannot truncate a huge offset into a
    // small, valid-looking index.
    let clamp = |bound: u64| bound.min(len as u64) as usize;
    let start = start.map_or(0, clamp);
    let end = end.map_or(len, clamp);
    // `get` returns `None` for an inverted range, where direct slicing would panic.
    seq.get(start..end).map(<[u8]>::to_vec).unwrap_or_default()
}
43
44/// Trait for retrieving sequences and their metadata.
45pub trait SequenceStore: Send + Sync {
46    /// Retrieve sequence bases by digest (MD5 or sha512t24u).
47    /// Supports optional start/end for subsequence retrieval (0-based, half-open).
48    fn get_sequence(
49        &self,
50        digest: &str,
51        start: Option<u64>,
52        end: Option<u64>,
53    ) -> StoreResult<Option<Vec<u8>>>;
54
55    /// Retrieve metadata for a sequence by digest.
56    fn get_metadata(&self, digest: &str) -> StoreResult<Option<SequenceMetadata>>;
57
58    /// Retrieve the length of a sequence by digest.
59    fn get_length(&self, digest: &str) -> StoreResult<Option<u64>>;
60}
61
62/// Result from listing collections.
63#[derive(Debug, Clone, Serialize, Deserialize)]
64pub struct ListResult {
65    pub items: Vec<ListItem>,
66    pub total: usize,
67    pub page: usize,
68    pub page_size: usize,
69}
70
71/// A single item in a collection listing.
72#[derive(Debug, Clone, Serialize, Deserialize)]
73pub struct ListItem {
74    pub digest: String,
75}
76
77/// Trait for storing and retrieving sequence collections.
78pub trait SeqColStore: Send + Sync {
79    /// Get a collection by its Level 0 digest.
80    fn get_collection(&self, digest: &str) -> Option<&refget_model::SeqCol>;
81
82    /// List collections with optional attribute-based filters, paginated.
83    fn list_collections(
84        &self,
85        filters: &[(String, String)],
86        page: usize,
87        page_size: usize,
88    ) -> ListResult;
89
90    /// Get a single attribute array by attribute name and its digest.
91    fn get_attribute(&self, name: &str, digest: &str) -> Option<serde_json::Value>;
92
93    /// Return the total number of collections.
94    fn count(&self) -> usize;
95}
96
97/// Collect FASTA files from a list of paths (files or directories).
98///
99/// Directories are searched non-recursively for files with FASTA extensions
100/// (`.fa`, `.fasta`, `.fna`, `.fas`). Results are sorted by path.
101pub fn collect_fasta_files(paths: &[PathBuf]) -> StoreResult<Vec<PathBuf>> {
102    let mut files = Vec::new();
103    for path in paths {
104        if path.is_dir() {
105            let entries = std::fs::read_dir(path)?;
106            for entry in entries {
107                let p = entry?.path();
108                if is_fasta_file(&p) {
109                    files.push(p);
110                }
111            }
112        } else if path.is_file() {
113            files.push(path.clone());
114        } else {
115            return Err(StoreError::Fasta(format!("Path does not exist: {}", path.display())));
116        }
117    }
118    files.sort();
119    Ok(files)
120}
121
/// Report whether `path` ends in one of the recognised FASTA extensions
/// (`fa`, `fasta`, `fna`, `fas`).
///
/// The comparison is case-sensitive, and a path with no extension or a
/// non-UTF-8 extension is not considered a FASTA file.
pub fn is_fasta_file(path: &Path) -> bool {
    const FASTA_EXTENSIONS: [&str; 4] = ["fa", "fasta", "fna", "fas"];
    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => FASTA_EXTENSIONS.contains(&ext),
        None => false,
    }
}
125}