Skip to main content

gtars_refget/
lib.rs

//! # Rust implementation of GA4GH Refget sequence collection functions
//!
//! This module provides functions for managing and retrieving sequences from a sequence collection.
//!
//! # Functions
//!
//! The module includes the following main components:
//!
//! * `alphabet.rs` - Defines various sequence alphabets (e.g., DNA, protein, ASCII).
//! * `collection.rs` - Contains the `SequenceCollection` struct and methods for managing sequence collections.
//! * `digest.rs` - Implements functions for calculating and verifying sha512t24u and other digests.
//! * `encoder.rs` - Contains functions for encoding sequences into compact representations.
//! * `fasta.rs` - Provides functions for reading and writing FASTA files.
//! * `store.rs` - Implements a sequence store that allows for efficient storage and retrieval of sequences indexed by sha512t24u digest.
15pub mod alphabet;
16pub mod collection;
17pub mod digest;
18pub mod encoder;
19pub mod fasta;
20pub mod store;
21
22// Used internally to make it easy to convert types to a 32-byte key for hash tables
23mod hashkeyable;
24mod utils;
25
#[cfg(test)]
mod tests {
    use super::*;

    use std::time::Instant;
    use store::RefgetStore;
    use tempfile::tempdir;

    /// Loads the FASTA file named by the `FASTA_PATH` environment variable into two
    /// in-memory stores (one left as constructed, one after `disable_encoding()`) and
    /// checks that both return the same substring for the first sequence digest.
    ///
    /// Marked `#[ignore]` because it needs an externally supplied (large) FASTA file;
    /// run it explicitly with `cargo test -- --ignored`.
    #[test]
    #[ignore]
    fn test_loading_large_fasta_file() {
        // Path to a large FASTA file, supplied by the caller's environment.
        let fasta_path =
            std::env::var("FASTA_PATH").expect("FASTA_PATH environment variable not set");
        println!("Loading large FASTA file: {}", &fasta_path);

        // Create a new sequence store, and add sequences to the store
        println!("Adding sequences from FASTA file...");
        let timer = Instant::now();
        let mut store = RefgetStore::in_memory();
        store.add_sequence_collection_from_fasta(&fasta_path).unwrap();
        let duration = timer.elapsed();
        println!("⏱️  Time taken to load: {:.2?}", duration);

        // Second store holding the same data, used as the comparison baseline below.
        let mut store2 = RefgetStore::in_memory();
        store2.disable_encoding(); // Switch to Raw mode
        store2.add_sequence_collection_from_fasta(&fasta_path).unwrap();

        // Get list of sequences
        let sequences: Vec<_> = store.sequence_digests().collect();
        assert!(!sequences.is_empty(), "No sequences found in the store");

        // Look up the first sequence by digest
        println!("Look up a sequence by digest...");
        let digest = &sequences[0];
        let digest_str = String::from_utf8(digest.to_vec()).expect("Invalid ASCII data");

        // Test retrieval of a substring
        println!("Retrieving a substring of sequence named: {:?}", digest_str);
        let start_basic = 0;
        let end_basic = 3;
        // Report the underlying error on failure instead of discarding it.
        let substring = store
            .get_substring(digest, start_basic, end_basic)
            .unwrap_or_else(|err| {
                panic!(
                    "Failed to retrieve substring with name: {:?}: {:?}",
                    digest_str, err
                )
            });
        println!("Retrieved substring: {:?}", substring);

        // Retrieve substring via digest
        let start = 148 * 70;
        let end = 148 * 70 + 70;
        let substring2 = store.get_substring(digest, start, end).unwrap_or_else(|err| {
            panic!(
                "Failed to retrieve substring with name: {:?}: {:?}",
                digest_str, err
            )
        });

        // Both stores must yield the same text for the same range.
        let substring3 = store2.get_substring(digest, start, end).unwrap_or_else(|err| {
            panic!(
                "Failed to retrieve substring with name: {:?}: {:?}",
                digest_str, err
            )
        });
        assert_eq!(substring2, substring3);
        println!("Retrieved substring: {:?}", substring2);
        println!("Retrieved substring: {:?}", substring3);
    }

    /// Loads `base.fa.gz` (copied into a temporary directory first) into an in-memory
    /// store and checks two substrings of a known sequence against expected values.
    #[test]
    fn test_get_sequence_encoded() {
        // `temp_dir` must stay bound until the end of the test so the directory
        // is not removed while the copied FASTA file is still being read.
        let temp_dir = tempdir().expect("Failed to create temporary directory");
        let temp_path = temp_dir.path();
        // Create a new sequence store
        let mut store = RefgetStore::in_memory();
        let fasta_path = "../tests/data/fasta/base.fa.gz";
        let temp_fasta = temp_path.join("base.fa.gz");
        std::fs::copy(fasta_path, &temp_fasta).expect("Failed to copy base.fa.gz to tempdir");

        // Add sequences to the store
        store.add_sequence_collection_from_fasta(temp_fasta).unwrap();
        println!("Listing sequences in the store...");
        let digest = "iYtREV555dUFKg2_agSJW6suquUyPpMw"; // from base.fa.gz

        // Test retrieval of a substring
        println!("Retrieving a substring of sequence named: {:?}", digest);
        let start = 2;
        let end = start + 5;
        let substring = store.get_substring(digest, start, end).unwrap_or_else(|err| {
            panic!(
                "Failed to retrieve substring with name: {:?}: {:?}",
                digest, err
            )
        });
        println!("Retrieved substring: {:?}", substring);
        assert_eq!(substring, "GGGGA");

        println!("Retrieving a substring of sequence named: {:?}", digest);
        let start = 3;
        let end = start + 2;
        let substring = store.get_substring(digest, start, end).unwrap_or_else(|err| {
            panic!(
                "Failed to retrieve substring with name: {:?}: {:?}",
                digest, err
            )
        });
        println!("Retrieved substring: {:?}", substring);
        assert_eq!(substring, "GG");
    }
}