// gtars_refget/lib.rs

//! # Rust implementation of GA4GH Refget sequence collection functions

//! This module provides functions for managing and retrieving sequences from a sequence collection.
//!
//! # Module Structure
//!
//! The library is organized into two main parts:
//!
//! ## Core (WASM-compatible)
//!
//! The `digest` module contains all WASM-compatible code:
//! - `digest::algorithms` - Hash functions (sha512t24u, md5, canonicalize_json)
//! - `digest::alphabet` - Sequence alphabets and encoding tables
//! - `digest::encoder` - Sequence bit-packing
//! - `digest::types` - Core data structures (SequenceRecord, SequenceCollection)
//! - `digest::fasta` - Bytes-based FASTA parsing
//! - `digest::stream` - Streaming FASTA hasher for chunk-by-chunk processing
//!
//! ## Filesystem (requires `filesystem` feature)
//!
//! - `fasta` - File-based FASTA parsing (wraps `digest::fasta` with file I/O)
//! - `collection` - Extended SequenceCollection with filesystem operations
//! - `store` - RefgetStore for persistent sequence storage
//! - `seqcol` - Seqcol spec operations (comparison, level-based retrieval, attribute search)
//!
//! # Feature Flags
//!
//! - `filesystem` (default): Enables file-based operations
//! - Without `filesystem`: Only the WASM-compatible code in the `digest` module is available

30// ============================================================================
31// Core WASM-compatible module
32// ============================================================================
33
34/// Core digest and encoding functionality - WASM-safe.
35/// All code in this module works without filesystem access.
36pub mod digest;
37
38// Re-export commonly used items from digest at crate root for convenience
39pub use digest::{
40    ASCII_ALPHABET,
41    // Alphabet
42    Alphabet,
43    AlphabetGuesser,
44    AlphabetType,
45    DNA_2BIT_ALPHABET,
46    DNA_3BIT_ALPHABET,
47    DNA_IUPAC_ALPHABET,
48    FaiMetadata,
49    // Streaming
50    FastaStreamHasher,
51    PROTEIN_ALPHABET,
52    ParseOptions,
53    SeqColDigestLvl1,
54    SequenceCollection,
55    SequenceCollectionMetadata,
56    SequenceCollectionRecord,
57    SequenceEncoder,
58    SequenceMetadata,
59    // Types
60    SequenceRecord,
61    canonicalize_json,
62    decode_string_from_bytes,
63    decode_substring_from_bytes,
64    // Fasta (bytes-based, WASM-compatible)
65    digest_fasta_bytes,
66    digest_sequence,
67    digest_sequence_with_description,
68    // Encoder
69    encode_sequence,
70    guess_alphabet,
71    load_fasta_bytes,
72    lookup_alphabet,
73    md5,
74    parse_fasta_header,
75    parse_rgsi_line,
76    // Algorithms
77    sha512t24u,
78};
79
// ============================================================================
// Filesystem-dependent modules (require `filesystem` feature)
// ============================================================================
//
// Every declaration in this section is gated on `feature = "filesystem"`, so a
// build without that feature contains only the `digest` module.

/// File-based FASTA operations.
/// Wraps the WASM-compatible digest::fasta with filesystem I/O.
#[cfg(feature = "filesystem")]
pub mod fasta;

/// Extended SequenceCollection with filesystem operations.
/// Adds methods for RGSI file I/O, caching, and file-based construction.
#[cfg(feature = "filesystem")]
pub mod collection;

/// Persistent sequence storage (RefgetStore).
#[cfg(feature = "filesystem")]
pub mod store;

/// Seqcol spec operations (comparison, level-based retrieval, attribute search).
#[cfg(feature = "filesystem")]
pub mod seqcol;

// Internal (crate-private) modules for filesystem operations
#[cfg(feature = "filesystem")]
mod hashkeyable;
#[cfg(feature = "filesystem")]
mod utils;

// Re-export filesystem functions and types at crate root for backward
// compatibility. Each re-export carries the same feature gate as the module
// it comes from.
#[cfg(feature = "filesystem")]
pub use collection::{
    SequenceCollectionExt, SequenceCollectionRecordExt, SequenceMetadataExt, SequenceRecordExt,
    read_rgsi_file,
};
#[cfg(feature = "filesystem")]
pub use fasta::{FaiRecord, compute_fai, digest_fasta, load_fasta};
#[cfg(feature = "filesystem")]
pub use seqcol::SeqColService;
// All `store` re-exports live in one statement so they are easy to find.
#[cfg(feature = "filesystem")]
pub use store::{
    AvailableAliases, FhrAuthor, FhrIdentifier, FhrMetadata, FhrTaxon, FhrVitalStats, PagedResult,
    Pagination, PullResult, SyncStrategy,
};

// ============================================================================
// Tests
// ============================================================================

/// Tests for the `RefgetStore` substring-retrieval path.
///
/// Compiled only for test builds with the `filesystem` feature enabled, because
/// both tests import FASTA files from disk.
#[cfg(all(test, feature = "filesystem"))]
mod tests {
    use super::*;

    use std::time::Instant;
    use store::{FastaImportOptions, RefgetStore};
    use tempfile::tempdir;

    /// Imports a user-supplied FASTA file into two in-memory stores (one with
    /// the default settings, one after `disable_encoding()`) and checks that
    /// both return the same substring for the same digest and coordinates.
    ///
    /// Marked `#[ignore]`: it only runs when requested explicitly and requires
    /// the `FASTA_PATH` environment variable to point at the input file.
    #[test]
    #[ignore]
    fn test_loading_large_fasta_file() {
        // Path to a large FASTA file
        let fasta_path =
            std::env::var("FASTA_PATH").expect("FASTA_PATH environment variable not set");
        println!("Loading large FASTA file: {}", fasta_path);

        // Create a new sequence store, and add sequences to the store
        println!("Adding sequences from FASTA file...");
        let load_timer = Instant::now();
        let mut store = RefgetStore::in_memory();
        store
            .add_sequence_collection_from_fasta(&fasta_path, FastaImportOptions::new())
            .unwrap();
        let duration = load_timer.elapsed();
        println!("Time taken to load: {:.2?}", duration);

        // Second store holding the same data, used as the comparison baseline.
        let mut store2 = RefgetStore::in_memory();
        store2.disable_encoding(); // Switch to Raw mode
        store2
            .add_sequence_collection_from_fasta(&fasta_path, FastaImportOptions::new())
            .unwrap();

        // Get list of sequences
        let sequences: Vec<_> = store.sequence_digests().collect();
        assert!(!sequences.is_empty(), "No sequences found in the store");

        // Look up the first sequence by digest
        println!("Look up a sequence by digest...");
        let digest = &sequences[0];
        let digest_str = String::from_utf8(digest.to_vec()).expect("Invalid ASCII data");

        // Test retrieval of a short substring at the very start of the sequence
        println!("Retrieving a substring of sequence named: {:?}", digest_str);
        let start_basic = 0;
        let end_basic = 3;
        let substring = store.get_substring(digest, start_basic, end_basic);
        assert!(
            substring.is_ok(),
            "Failed to retrieve substring with name: {:?}",
            digest_str
        );
        println!("Retrieved substring: {:?}", substring.unwrap());

        // Retrieve a 70-character window further into the sequence.
        // NOTE(review): 148 * 70 presumably targets one line of a FASTA wrapped at
        // 70 columns; confirm against the intended input file.
        let deep_start = 148 * 70;
        let deep_end = 148 * 70 + 70;
        let substring2 = store.get_substring(digest, deep_start, deep_end);
        assert!(
            substring2.is_ok(),
            "Failed to retrieve substring with name: {:?}",
            digest_str
        );

        // Both stores must agree on the same window.
        let substring3 = store2.get_substring(digest, deep_start, deep_end);
        assert_eq!(substring2.as_ref().unwrap(), substring3.as_ref().unwrap());
        println!("Retrieved substring: {:?}", substring2.unwrap());
        println!("Retrieved substring: {:?}", substring3.unwrap());
    }

    /// Imports the bundled `base.fa.gz` fixture into an in-memory store and
    /// checks two substrings of a known sequence against their expected text.
    ///
    /// The assertions show that `get_substring` takes a half-open range:
    /// `[2, 7)` yields five characters and `[3, 5)` yields two.
    #[test]
    fn test_get_sequence_encoded() {
        // Keep the `temp_dir` binding alive for the whole test: per the `tempfile`
        // documentation the directory is removed when the guard is dropped.
        let temp_dir = tempdir().expect("Failed to create temporary directory");
        let temp_path = temp_dir.path();
        // Create a new sequence store
        let mut store = RefgetStore::in_memory();
        // Work on a copy of the fixture rather than the checked-in file.
        // NOTE(review): presumably this keeps any files the import writes next to
        // the FASTA out of the repository's test data; confirm.
        let fasta_path = "../tests/data/fasta/base.fa.gz";
        let temp_fasta = temp_path.join("base.fa.gz");
        std::fs::copy(fasta_path, &temp_fasta).expect("Failed to copy base.fa.gz to tempdir");

        // Add sequences to the store
        store
            .add_sequence_collection_from_fasta(temp_fasta, FastaImportOptions::new())
            .unwrap();
        println!("Listing sequences in the store...");
        // The digest is already a `&str`, so it is printed directly; no UTF-8
        // round trip is needed.
        let digest = "iYtREV555dUFKg2_agSJW6suquUyPpMw"; // from base.fa.gz

        // Test retrieval of a substring
        println!("Retrieving a substring of sequence named: {:?}", digest);
        let start = 2;
        let end = start + 5;
        let substring = store.get_substring(digest, start, end);
        assert!(
            substring.is_ok(),
            "Failed to retrieve substring with name: {:?}",
            digest
        );
        println!("Retrieved substring: {:?}", substring.as_ref().unwrap());
        assert_eq!(substring.unwrap(), "GGGGA");

        // A second, shorter window that lies inside the first one
        println!("Retrieving a substring of sequence named: {:?}", digest);
        let start = 3;
        let end = start + 2;
        let substring = store.get_substring(digest, start, end);
        assert!(
            substring.is_ok(),
            "Failed to retrieve substring with name: {:?}",
            digest
        );
        println!("Retrieved substring: {:?}", substring.as_ref().unwrap());
        assert_eq!(substring.unwrap(), "GG");
    }
}
239}