gtars_refget/lib.rs
1//! # Rust implementation of GA4GH Refget sequence collection functions
2
3//! This module provides functions for managing and retrieving sequences from a sequence collection.
4//!
5//! # Functions
6//!
7//! The module includes the following main components:
8//!
9//! * `alphabet.rs` - Defines various sequence alphabets (e.g., DNA, protein, ASCII).
10//! * `collection.rs` - Contains the `SequenceCollection` struct and methods for managing sequence collections.
11//! * `digest.rs` - Implements functions for calculating and verifying sha512t24u and other digests.
12//! * `encoder.rs` - Contains functions for encoding sequences into compact representations.
13//! * `fasta.rs` - Provides functions for reading and writing FASTA files.
14//! * `store.rs` - Implements a sequence store that allows for efficient storage and retrieval of sequences indexed by sha512t24u digest.
15pub mod alphabet;
16pub mod collection;
17pub mod digest;
18pub mod encoder;
19pub mod fasta;
20pub mod store;
21
22// Used internally to make it easy to convert types to a 32-byte key for hash tables
23mod hashkeyable;
24mod utils;
25
#[cfg(test)]
mod tests {
    use super::*;

    use std::time::Instant;
    use store::GlobalRefgetStore;
    use store::StorageMode;
    use tempfile::tempdir;

    // NOTE(review): the test below is disabled. It calls `add_sequence`,
    // `get_sequence`, `sha512t24u` and `md5`, none of which are imported in this
    // module — confirm they still exist (and import them) before re-enabling.
    //
    // #[test]
    // fn test_sequence_retrieval_performance() {
    //     // Create a new sequence store
    //     let mut store = GlobalRefgetStore::new(StorageMode::Encoded);
    //
    //     // Add a variety of sequences
    //     let sequences = vec![
    //         ("seq1", b"ACGTACGT".to_vec()),
    //         ("seq2", b"TGCATGCA".to_vec()),
    //         ("seq3", b"NNNNRRYY".to_vec()),
    //         ("seq4", b"ACTGACTG".to_vec()),
    //     ];
    //
    //     // Add sequences to store
    //     for (name, seq) in &sequences {
    //         println!("Add sequence with name: {}", name);
    //         store.add_sequence(name, seq, None);
    //         let retrieved = store.get_sequence(&name).unwrap();
    //         println!("Retrieved sequence: {:?}", retrieved);
    //     }
    //     println!("{}", store);
    //
    //     // Test retrieving subsequences in a loop
    //     for _ in 0..1000 {
    //         for (name, seq) in &sequences {
    //             // Get the sha512t24u digest
    //             let sha512_digest = sha512t24u(seq);
    //             let md5_digest = md5(seq);
    //
    //             // Try retrieving by all three methods
    //             for key in &[&sha512_digest, &md5_digest, *name] {
    //                 // Get full sequence
    //                 let stored_sequence = store.get_sequence(key).unwrap();
    //                 assert_eq!(stored_sequence.len(), seq.len());
    //
    //                 // Get various substrings
    //                 for start in 0..seq.len() {
    //                     for end in (start + 1)..=seq.len() {
    //                         let substring = store.get_substring(key, start, end).unwrap();
    //                         assert_eq!(substring.len(), end - start);
    //                     }
    //                 }
    //             }
    //         }
    //     }
    // }

    /// Imports the FASTA file named by the `FASTA_PATH` environment variable into
    /// one `Encoded` store and one `Raw` store, then checks that substrings can be
    /// retrieved by digest and that both stores return the same substring.
    ///
    /// Marked `#[ignore]` because it needs an external (large) FASTA file; run it
    /// explicitly with `FASTA_PATH=<file> cargo test -- --ignored`.
    ///
    /// # Panics
    ///
    /// Panics if `FASTA_PATH` is unset, the import fails, the store ends up
    /// empty, or any substring lookup returns `None`.
    #[test]
    #[ignore]
    fn test_loading_large_fasta_file() {
        // Path to a large FASTA file, e.g. GRCh38_full_analysis_set_plus_decoy_hla.fa
        let fasta_path =
            std::env::var("FASTA_PATH").expect("FASTA_PATH environment variable not set");
        println!("Loading large FASTA file: {}", &fasta_path);

        // Create a new sequence store, and add sequences to the store
        println!("Adding sequences from FASTA file...");
        let load_timer = Instant::now();
        let mut store = GlobalRefgetStore::new(StorageMode::Encoded);
        store.import_fasta(&fasta_path).unwrap();
        let duration = load_timer.elapsed();
        println!("⏱️ Time taken to load: {:.2?}", duration);

        // Second store in Raw mode, used below to cross-check the Encoded store.
        let mut store2 = GlobalRefgetStore::new(StorageMode::Raw);
        store2.import_fasta(&fasta_path).unwrap();

        // Get list of sequences
        let sequences = store.list_sequence_digests();
        assert!(!sequences.is_empty(), "No sequences found in the store");

        // Look up the first sequence by digest
        println!("Look up a sequence by digest...");
        let digest = &sequences[0];
        let digest_str = String::from_utf8(digest.to_vec()).expect("Invalid ASCII data");

        // Test retrieval of a substring at the very start of the sequence
        println!("Retrieving a substring of sequence named: {:?}", digest_str);
        let start_basic = 0;
        let end_basic = 3;
        let substring = store
            .get_substring(digest, start_basic, end_basic)
            .unwrap_or_else(|| {
                panic!("Failed to retrieve substring with name: {:?}", digest_str)
            });
        println!("Retrieved substring: {:?}", substring);

        // Retrieve a substring further into the sequence from both stores
        let start = 148 * 70;
        let end = 148 * 70 + 70;
        let substring2 = store.get_substring(digest, start, end).unwrap_or_else(|| {
            panic!("Failed to retrieve substring with name: {:?}", digest_str)
        });
        let substring3 = store2.get_substring(digest, start, end).unwrap_or_else(|| {
            panic!(
                "Failed to retrieve substring from the Raw store with name: {:?}",
                digest_str
            )
        });

        // Both storage modes must yield the same data for the same coordinates.
        assert_eq!(
            substring2, substring3,
            "Encoded and Raw stores returned different substrings"
        );
        println!("Retrieved substring: {:?}", substring2);
        println!("Retrieved substring: {:?}", substring3);
    }

    /// Copies the `base.fa.gz` fixture into a temporary directory, imports it
    /// into an `Encoded` store, and checks two substrings of a known sequence
    /// against their expected values.
    ///
    /// # Panics
    ///
    /// Panics if the temporary directory cannot be created, the fixture cannot
    /// be copied or imported, or a substring is missing or differs from the
    /// expected text.
    #[test]
    fn test_get_sequence_encoded() {
        let temp_dir = tempdir().expect("Failed to create temporary directory");
        let temp_path = temp_dir.path();

        // Create a new sequence store
        let mut store = GlobalRefgetStore::new(StorageMode::Encoded);

        // Work on a copy of the fixture inside the temporary directory.
        // Alternate fixture: "../tests/data/subset.fa.gz"
        // (digest "Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO").
        let fasta_path = "../tests/data/fasta/base.fa.gz";
        let temp_fasta = temp_path.join("base.fa.gz");
        std::fs::copy(fasta_path, &temp_fasta).expect("Failed to copy base.fa.gz to tempdir");

        // Add sequences to the store
        store.import_fasta(temp_fasta).unwrap();
        println!("Listing sequences in the store...");

        let digest = "iYtREV555dUFKg2_agSJW6suquUyPpMw"; // from base.fa.gz

        // (start, end, expected substring) for the sequence identified by `digest`.
        let cases = [(2, 7, "GGGGA"), (3, 5, "GG")];

        for &(start, end, expected) in &cases {
            println!("Retrieving a substring of sequence named: {:?}", digest);
            let substring = store
                .get_substring(digest, start, end)
                .unwrap_or_else(|| {
                    panic!("Failed to retrieve substring with name: {:?}", digest)
                });
            println!("Retrieved substring: {:?}", substring);
            assert_eq!(
                substring, expected,
                "Unexpected substring for range {}..{}",
                start, end
            );
        }
    }
}