Skip to main content

gtars_refget/
lib.rs

1//! # Rust implementation of GA4GH Refget sequence collection functions
2
3//! This module provides functions for managing and retrieving sequences from a sequence collection.
4//!
5//! # Module Structure
6//!
7//! The library is organized into two main parts:
8//!
9//! ## Core (WASM-compatible)
10//!
11//! The `digest` module contains all WASM-compatible code:
12//! - `digest::algorithms` - Hash functions (sha512t24u, md5, canonicalize_json)
13//! - `digest::alphabet` - Sequence alphabets and encoding tables
14//! - `digest::encoder` - Sequence bit-packing
15//! - `digest::types` - Core data structures (SequenceRecord, SequenceCollection)
16//! - `digest::fasta` - Bytes-based FASTA parsing
17//! - `digest::stream` - Streaming FASTA hasher for chunk-by-chunk processing
18//!
19//! ## Filesystem (requires `filesystem` feature)
20//!
21//! - `fasta` - File-based FASTA parsing (wraps digest::fasta with file I/O)
22//! - `collection` - Extended SequenceCollection with filesystem operations
23//! - `store` - RefgetStore for persistent sequence storage
24//!
25//! # Feature Flags
26//!
27//! - `filesystem` (default): Enables file-based operations
28//! - Without `filesystem`: Only WASM-compatible code in `digest` module
29
30// ============================================================================
31// Core WASM-compatible module
32// ============================================================================
33
34/// Core digest and encoding functionality - WASM-safe.
35/// All code in this module works without filesystem access.
36pub mod digest;
37
38// Re-export commonly used items from digest at crate root for convenience
39pub use digest::{
40    ASCII_ALPHABET,
41    // Alphabet
42    Alphabet,
43    AlphabetGuesser,
44    AlphabetType,
45    DNA_2BIT_ALPHABET,
46    DNA_3BIT_ALPHABET,
47    DNA_IUPAC_ALPHABET,
48    FaiMetadata,
49    // Streaming
50    FastaStreamHasher,
51    PROTEIN_ALPHABET,
52    ParseOptions,
53    SeqColDigestLvl1,
54    SequenceCollection,
55    SequenceCollectionMetadata,
56    SequenceCollectionRecord,
57    SequenceEncoder,
58    SequenceMetadata,
59    // Types
60    SequenceRecord,
61    canonicalize_json,
62    decode_string_from_bytes,
63    decode_substring_from_bytes,
64    // Fasta (bytes-based, WASM-compatible)
65    digest_fasta_bytes,
66    digest_sequence,
67    digest_sequence_with_description,
68    // Encoder
69    encode_sequence,
70    guess_alphabet,
71    load_fasta_bytes,
72    lookup_alphabet,
73    md5,
74    parse_fasta_header,
75    parse_rgsi_line,
76    // Algorithms
77    sha512t24u,
78};
79
80// ============================================================================
81// Filesystem-dependent modules (require `filesystem` feature)
82// ============================================================================
83
84/// File-based FASTA operations.
85/// Wraps the WASM-compatible digest::fasta with filesystem I/O.
86#[cfg(feature = "filesystem")]
87pub mod fasta;
88
89/// Extended SequenceCollection with filesystem operations.
90/// Adds methods for RGSI file I/O, caching, and file-based construction.
91#[cfg(feature = "filesystem")]
92pub mod collection;
93
94/// Persistent sequence storage (RefgetStore).
95#[cfg(feature = "filesystem")]
96pub mod store;
97
98// Internal modules for filesystem operations
99#[cfg(feature = "filesystem")]
100mod hashkeyable;
101#[cfg(feature = "filesystem")]
102mod utils;
103
104// Re-export filesystem functions at crate root for backward compatibility
105#[cfg(feature = "filesystem")]
106pub use collection::{
107    SequenceCollectionExt, SequenceCollectionRecordExt, SequenceMetadataExt, SequenceRecordExt,
108    read_rgsi_file,
109};
110#[cfg(feature = "filesystem")]
111pub use fasta::{FaiRecord, compute_fai, digest_fasta, load_fasta};
112
113// ============================================================================
114// Tests
115// ============================================================================
116
#[cfg(all(test, feature = "filesystem"))]
mod tests {
    // Tests that exercise `RefgetStore` through the crate-root API. They are only
    // compiled when the `filesystem` feature is enabled because they read FASTA
    // files from disk.
    use super::*;

    use std::time::Instant;
    use store::RefgetStore;
    use tempfile::tempdir;

    /// Loads the FASTA file named by the `FASTA_PATH` environment variable into two
    /// in-memory stores (one left as created, one after `disable_encoding()`) and
    /// checks that both return the same substring for the first sequence digest.
    ///
    /// Marked `#[ignore]`, so it only runs when explicitly requested, e.g.
    /// `FASTA_PATH=/path/to/genome.fa cargo test -- --ignored`.
    ///
    /// # Panics
    ///
    /// Panics if `FASTA_PATH` is unset, if loading fails, if the store is empty,
    /// or if any substring lookup returns an error.
    #[test]
    #[ignore]
    fn test_loading_large_fasta_file() {
        // Path to a large FASTA file
        let fasta_path =
            std::env::var("FASTA_PATH").expect("FASTA_PATH environment variable not set");
        println!("Loading large FASTA file: {}", &fasta_path);

        // Create a new sequence store and add sequences to it, timing the load.
        println!("Adding sequences from FASTA file...");
        let load_timer = Instant::now();
        let mut store = RefgetStore::in_memory();
        store
            .add_sequence_collection_from_fasta(&fasta_path)
            .expect("Failed to add FASTA file to the store");
        println!("Time taken to load: {:.2?}", load_timer.elapsed());

        // Second store with the same content, but with encoding disabled (Raw mode).
        let mut store2 = RefgetStore::in_memory();
        store2.disable_encoding();
        store2
            .add_sequence_collection_from_fasta(&fasta_path)
            .expect("Failed to add FASTA file to the raw-mode store");

        // Pick the first sequence digest reported by the store.
        let sequences: Vec<_> = store.sequence_digests().collect();
        let digest = sequences
            .first()
            .expect("No sequences found in the store");

        println!("Look up a sequence by digest...");
        let digest_str = String::from_utf8(digest.to_vec()).expect("Invalid ASCII data");

        // Short lookup at the very start of the sequence. The error itself is
        // included in the panic message so a failure explains why it happened.
        println!("Retrieving a substring of sequence named: {:?}", digest_str);
        let head = store.get_substring(digest, 0, 3).unwrap_or_else(|err| {
            panic!(
                "Failed to retrieve substring with name: {:?}: {:?}",
                digest_str, err
            )
        });
        println!("Retrieved substring: {:?}", head);

        // Lookup further into the sequence.
        // NOTE(review): the 148 * 70 offset presumably targets line 148 of a
        // 70-column FASTA; it requires the first sequence to be at least
        // 148 * 70 + 70 residues long — confirm against the intended input file.
        let start = 148 * 70;
        let end = start + 70;
        let encoded = store
            .get_substring(digest, start, end)
            .unwrap_or_else(|err| {
                panic!(
                    "Failed to retrieve substring with name: {:?}: {:?}",
                    digest_str, err
                )
            });
        let raw = store2
            .get_substring(digest, start, end)
            .unwrap_or_else(|err| {
                panic!(
                    "Failed to retrieve substring from raw-mode store with name: {:?}: {:?}",
                    digest_str, err
                )
            });

        // Both stores must agree regardless of how the sequence is held.
        assert_eq!(
            encoded, raw,
            "Default and raw-mode stores returned different substrings"
        );
        println!("Retrieved substring: {:?}", encoded);
        println!("Retrieved substring: {:?}", raw);
    }

    /// Loads `base.fa.gz` into an in-memory store and checks two substring
    /// lookups against their known expected values.
    ///
    /// # Panics
    ///
    /// Panics if the fixture cannot be copied or loaded, if a lookup returns an
    /// error, or if a retrieved substring differs from the expected one.
    #[test]
    fn test_get_sequence_encoded() {
        // Work on a copy of the fixture inside a temporary directory.
        // NOTE(review): presumably this keeps any files the loader writes next to
        // the FASTA out of the source tree — confirm.
        let temp_dir = tempdir().expect("Failed to create temporary directory");
        let temp_fasta = temp_dir.path().join("base.fa.gz");
        std::fs::copy("../tests/data/fasta/base.fa.gz", &temp_fasta)
            .expect("Failed to copy base.fa.gz to tempdir");

        // Create a new sequence store and add the sequences to it.
        let mut store = RefgetStore::in_memory();
        store
            .add_sequence_collection_from_fasta(temp_fasta)
            .expect("Failed to add base.fa.gz to the store");

        let digest = "iYtREV555dUFKg2_agSJW6suquUyPpMw"; // from base.fa.gz

        // (start, length, expected substring)
        let cases = [(2, 5, "GGGGA"), (3, 2, "GG")];
        for (start, len, expected) in cases {
            let end = start + len;
            println!("Retrieving a substring of sequence named: {:?}", digest);
            let substring = store
                .get_substring(digest, start, end)
                .unwrap_or_else(|err| {
                    panic!(
                        "Failed to retrieve substring with name: {:?}: {:?}",
                        digest, err
                    )
                });
            println!("Retrieved substring: {:?}", substring);
            assert_eq!(substring, expected);
        }
    }
}