Skip to main content

gtars_refget/
collection.rs

1//! Extended SequenceCollection with filesystem operations.
2//!
3//! This module adds file I/O and caching methods to the core types
4//! defined in `crate::digest::types`.
5
6use anyhow::Result;
7use std::io::Write;
8use std::path::Path;
9
10use crate::fasta::digest_fasta;
11use crate::utils::PathExtension;
12
13// Re-export core types from digest::types for backward compatibility
14pub use crate::digest::types::{
15    FaiMetadata, SeqColDigestLvl1, SequenceCollection, SequenceCollectionMetadata,
16    SequenceCollectionRecord, SequenceMetadata, SequenceRecord, digest_sequence,
17    digest_sequence_with_description, parse_rgsi_line,
18};
19
20/// Shared implementation for writing RGSI (Refget Sequence Index) files.
21fn write_rgsi_impl<P: AsRef<Path>>(
22    file_path: P,
23    metadata: &SequenceCollectionMetadata,
24    sequences: Option<&[SequenceRecord]>,
25) -> Result<()> {
26    let file_path = file_path.as_ref();
27    let mut file = std::fs::File::create(file_path)?;
28
29    // Write collection digest headers
30    writeln!(file, "##seqcol_digest={}", metadata.digest)?;
31    writeln!(file, "##names_digest={}", metadata.names_digest)?;
32    writeln!(file, "##sequences_digest={}", metadata.sequences_digest)?;
33    writeln!(file, "##lengths_digest={}", metadata.lengths_digest)?;
34    if let Some(ref nlp) = metadata.name_length_pairs_digest {
35        writeln!(file, "##name_length_pairs_digest={}", nlp)?;
36    }
37    if let Some(ref snlp) = metadata.sorted_name_length_pairs_digest {
38        writeln!(file, "##sorted_name_length_pairs_digest={}", snlp)?;
39    }
40    if let Some(ref ss) = metadata.sorted_sequences_digest {
41        writeln!(file, "##sorted_sequences_digest={}", ss)?;
42    }
43    writeln!(
44        file,
45        "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
46    )?;
47
48    // Write sequence metadata if available
49    if let Some(seqs) = sequences {
50        for seq_record in seqs {
51            let seq_meta = seq_record.metadata();
52            writeln!(
53                file,
54                "{}\t{}\t{}\t{}\t{}\t{}",
55                seq_meta.name,
56                seq_meta.length,
57                seq_meta.alphabet,
58                seq_meta.sha512t24u,
59                seq_meta.md5,
60                seq_meta.description.as_deref().unwrap_or("")
61            )?;
62        }
63    }
64    Ok(())
65}
66
67// ============================================================================
68// Extensions to SequenceMetadata for filesystem operations
69// ============================================================================
70
71/// Extension trait for SequenceMetadata with filesystem-dependent methods.
72pub trait SequenceMetadataExt {
73    fn disk_size(&self, mode: &crate::store::StorageMode) -> usize;
74}
75
76impl SequenceMetadataExt for SequenceMetadata {
77    /// Calculate the disk size in bytes for this sequence.
78    fn disk_size(&self, mode: &crate::store::StorageMode) -> usize {
79        match mode {
80            crate::store::StorageMode::Raw => self.length,
81            crate::store::StorageMode::Encoded => {
82                let bits_per_symbol = self.alphabet.bits_per_symbol();
83                let total_bits = self.length * bits_per_symbol;
84                total_bits.div_ceil(8)
85            }
86        }
87    }
88}
89
90// ============================================================================
91// Extensions to SequenceRecord for filesystem operations
92// ============================================================================
93
94/// Extension trait for SequenceRecord with filesystem-dependent methods.
95pub trait SequenceRecordExt {
96    fn to_file<P: AsRef<Path>>(&self, path: P) -> Result<()>;
97}
98
99impl SequenceRecordExt for SequenceRecord {
100    /// Write a single sequence to a file.
101    fn to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
102        use std::fs::{self, File};
103        use std::io::Write;
104
105        let data = match self {
106            SequenceRecord::Stub(_) => {
107                return Err(anyhow::anyhow!(
108                    "Cannot write file: sequence data not loaded"
109                ));
110            }
111            SequenceRecord::Full { sequence, .. } => sequence,
112        };
113
114        // Create parent directories if they don't exist
115        if let Some(parent) = path.as_ref().parent() {
116            fs::create_dir_all(parent)?;
117        }
118
119        let mut file = File::create(path)?;
120        file.write_all(data)?;
121        Ok(())
122    }
123}
124
125// ============================================================================
126// Extensions to SequenceCollectionRecord for filesystem operations
127// ============================================================================
128
129/// Extension trait for SequenceCollectionRecord with filesystem-dependent methods.
130pub trait SequenceCollectionRecordExt {
131    fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()>;
132}
133
134impl SequenceCollectionRecordExt for SequenceCollectionRecord {
135    /// Write the collection to an RGSI file.
136    fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
137        write_rgsi_impl(file_path, self.metadata(), self.sequences())
138    }
139}
140
141// ============================================================================
142// Extensions to SequenceCollection for filesystem operations
143// ============================================================================
144
145/// Extension trait for SequenceCollection with filesystem-dependent methods.
146pub trait SequenceCollectionExt {
147    fn from_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
148    fn from_rgsi<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
149    fn from_path_no_cache<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
150    fn from_path_with_cache<P: AsRef<Path>>(
151        file_path: P,
152        read_cache: bool,
153        write_cache: bool,
154    ) -> Result<SequenceCollection>;
155    fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()>;
156    fn write_rgsi(&self) -> Result<()>;
157    fn to_record(&self) -> SequenceCollectionRecord;
158    fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()>;
159}
160
161impl SequenceCollectionExt for SequenceCollection {
162    /// Default behavior: read and write cache
163    fn from_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
164        Self::from_path_with_cache(file_path, true, true)
165    }
166
167    fn from_rgsi<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
168        let rgsi_file_path = file_path.as_ref().replace_exts_with("rgsi");
169
170        if rgsi_file_path.exists() {
171            read_rgsi_file(&rgsi_file_path)
172        } else {
173            Err(anyhow::anyhow!(
174                "RGSI file does not exist at {:?}",
175                rgsi_file_path
176            ))
177        }
178    }
179
180    /// No caching at all
181    fn from_path_no_cache<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
182        Self::from_path_with_cache(file_path, false, false)
183    }
184
185    fn from_path_with_cache<P: AsRef<Path>>(
186        file_path: P,
187        read_cache: bool,
188        write_cache: bool,
189    ) -> Result<SequenceCollection> {
190        let fa_file_path = file_path.as_ref();
191        let rgsi_file_path = fa_file_path.replace_exts_with("rgsi");
192
193        // Check if the cache file exists and is valid
194        if read_cache && rgsi_file_path.exists() {
195            let seqcol = read_rgsi_file(&rgsi_file_path)?;
196            if !seqcol.sequences.is_empty() {
197                return Ok(seqcol);
198            }
199            // Cache is empty/stale - fall through to re-digest
200            let _ = std::fs::remove_file(&rgsi_file_path);
201        }
202
203        // Digest the fasta file
204        let seqcol: SequenceCollection = digest_fasta(file_path.as_ref())?;
205
206        // Write the SequenceCollection to the RGSI file
207        if write_cache && !rgsi_file_path.exists() {
208            seqcol.write_rgsi()?;
209        }
210        Ok(seqcol)
211    }
212
213    /// Write the SequenceCollection to an RGSI file.
214    fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
215        write_rgsi_impl(file_path, &self.metadata, Some(&self.sequences))
216    }
217
218    /// Write the SequenceCollection to an RGSI file using the stored file path.
219    fn write_rgsi(&self) -> Result<()> {
220        if let Some(file_path) = &self.metadata.file_path {
221            let rgsi_file_path = file_path.replace_exts_with("rgsi");
222            self.write_collection_rgsi(rgsi_file_path)
223        } else {
224            Err(anyhow::anyhow!(
225                "No file path specified for RGSI output. Use `write_collection_rgsi` to specify a file path."
226            ))
227        }
228    }
229
230    /// Convert to a SequenceCollectionRecord for storage.
231    fn to_record(&self) -> SequenceCollectionRecord {
232        SequenceCollectionRecord::from(self.clone())
233    }
234
235    /// Write the SequenceCollection to a FASTA file.
236    fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()> {
237        use std::fs::File;
238
239        let line_width = line_width.unwrap_or(70);
240        let mut output_file = File::create(file_path)?;
241
242        for record in &self.sequences {
243            if !record.is_loaded() {
244                return Err(anyhow::anyhow!(
245                    "Cannot write FASTA: sequence '{}' does not have data loaded",
246                    record.metadata().name
247                ));
248            }
249
250            let decoded_sequence = record.decode().ok_or_else(|| {
251                anyhow::anyhow!("Failed to decode sequence '{}'", record.metadata().name)
252            })?;
253
254            // Write FASTA header
255            let metadata = record.metadata();
256            let header = match &metadata.description {
257                Some(desc) => format!(">{} {}", metadata.name, desc),
258                None => format!(">{}", metadata.name),
259            };
260            writeln!(output_file, "{}", header)?;
261
262            // Write sequence with line wrapping
263            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
264                output_file.write_all(chunk)?;
265                output_file.write_all(b"\n")?;
266            }
267        }
268
269        Ok(())
270    }
271}
272
273// ============================================================================
274// RGSI File I/O
275// ============================================================================
276
277/// Read an RGSI file and return a SequenceCollection.
278pub fn read_rgsi_file<T: AsRef<Path>>(file_path: T) -> Result<SequenceCollection> {
279    use std::io::BufRead;
280
281    let file = std::fs::File::open(&file_path)?;
282    let reader = std::io::BufReader::new(file);
283    let mut results = Vec::new();
284
285    // Variables to store header metadata
286    let mut seqcol_digest = String::new();
287    let mut names_digest = String::new();
288    let mut sequences_digest = String::new();
289    let mut lengths_digest = String::new();
290    let mut name_length_pairs_digest: Option<String> = None;
291    let mut sorted_name_length_pairs_digest: Option<String> = None;
292    let mut sorted_sequences_digest: Option<String> = None;
293    for line in reader.lines() {
294        let line = line?;
295
296        // Parse header metadata lines
297        if line.starts_with("##") {
298            if let Some(stripped) = line.strip_prefix("##") {
299                if let Some((key, value)) = stripped.split_once('=') {
300                    match key {
301                        "seqcol_digest" => seqcol_digest = value.to_string(),
302                        "names_digest" => names_digest = value.to_string(),
303                        "sequences_digest" => sequences_digest = value.to_string(),
304                        "lengths_digest" => lengths_digest = value.to_string(),
305                        "name_length_pairs_digest" => name_length_pairs_digest = Some(value.to_string()),
306                        "sorted_name_length_pairs_digest" => sorted_name_length_pairs_digest = Some(value.to_string()),
307                        "sorted_sequences_digest" => sorted_sequences_digest = Some(value.to_string()),
308                        _ => {} // Ignore unknown metadata keys
309                    }
310                }
311            }
312            continue;
313        }
314
315        // Skip other comment lines
316        if line.starts_with('#') {
317            continue;
318        }
319
320        // Parse sequence data line
321        if let Some(metadata) = parse_rgsi_line(&line) {
322            results.push(SequenceRecord::Stub(metadata));
323        }
324    }
325
326    // Build collection metadata
327    let collection_metadata = if sequences_digest.is_empty()
328        || names_digest.is_empty()
329        || lengths_digest.is_empty()
330    {
331        // Compute from sequence records
332        SequenceCollectionMetadata::from_sequences(&results, Some(file_path.as_ref().to_path_buf()))
333    } else {
334        // Use digests from file header
335        let lvl1 = SeqColDigestLvl1 {
336            sequences_digest,
337            names_digest,
338            lengths_digest,
339        };
340
341        let digest = if seqcol_digest.is_empty() {
342            lvl1.to_digest()
343        } else {
344            seqcol_digest
345        };
346
347        SequenceCollectionMetadata {
348            digest,
349            n_sequences: results.len(),
350            names_digest: lvl1.names_digest,
351            sequences_digest: lvl1.sequences_digest,
352            lengths_digest: lvl1.lengths_digest,
353            name_length_pairs_digest,
354            sorted_name_length_pairs_digest,
355            sorted_sequences_digest,
356            file_path: Some(file_path.as_ref().to_path_buf()),
357        }
358    };
359
360    Ok(SequenceCollection {
361        metadata: collection_metadata,
362        sequences: results,
363    })
364}
365
366// ============================================================================
367// Tests
368// ============================================================================
369
370#[cfg(test)]
371mod tests {
372    use super::*;
373    use crate::digest::encode_sequence;
374    use crate::digest::{AlphabetType, lookup_alphabet};
375    use crate::fasta::{digest_fasta, load_fasta};
376
377    #[test]
378    fn test_decode_returns_none_when_no_data() {
379        let seqcol =
380            digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
381
382        for seq_record in &seqcol.sequences {
383            assert!(!seq_record.is_loaded());
384            assert_eq!(seq_record.decode(), None);
385        }
386    }
387
388    #[test]
389    fn test_decode_with_loaded_data() {
390        let seqcol =
391            load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
392
393        let expected_sequences = vec![("chrX", "TTGGGGAA"), ("chr1", "GGAA"), ("chr2", "GCGC")];
394
395        for (seq_record, (expected_name, expected_seq)) in
396            seqcol.sequences.iter().zip(expected_sequences.iter())
397        {
398            assert_eq!(seq_record.metadata().name, *expected_name);
399            assert!(seq_record.is_loaded());
400            let decoded = seq_record.decode().expect("decode() should return Some");
401            assert_eq!(decoded, *expected_seq);
402        }
403    }
404
405    #[test]
406    fn test_decode_handles_encoded_data() {
407        let sequence = b"ACGT";
408        let alphabet = lookup_alphabet(&AlphabetType::Dna2bit);
409        let encoded_data = encode_sequence(sequence, alphabet);
410
411        let record = SequenceRecord::Full {
412            metadata: SequenceMetadata {
413                name: "test_seq".to_string(),
414                description: None,
415                length: 4,
416                sha512t24u: "test_digest".to_string(),
417                md5: "test_md5".to_string(),
418                alphabet: AlphabetType::Dna2bit,
419                fai: None,
420            },
421            sequence: encoded_data,
422        };
423
424        let decoded = record.decode().expect("Should decode encoded data");
425        assert_eq!(decoded, "ACGT");
426    }
427
428    #[test]
429    fn test_sequence_collection_from_fasta() {
430        let seqcol = SequenceCollection::from_path_no_cache("../tests/data/fasta/base.fa")
431            .expect("Failed to load FASTA");
432
433        assert_eq!(seqcol.sequences.len(), 3);
434        assert!(!seqcol.metadata.digest.is_empty());
435    }
436
437    #[test]
438    fn test_sequence_collection_write_and_read_rgsi() {
439        use tempfile::tempdir;
440
441        let seqcol =
442            digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
443
444        let dir = tempdir().expect("Failed to create temp dir");
445        let rgsi_path = dir.path().join("test.rgsi");
446
447        seqcol
448            .write_collection_rgsi(&rgsi_path)
449            .expect("Failed to write RGSI");
450
451        let loaded = read_rgsi_file(&rgsi_path).expect("Failed to read RGSI");
452
453        assert_eq!(loaded.metadata.digest, seqcol.metadata.digest);
454        assert_eq!(loaded.sequences.len(), seqcol.sequences.len());
455    }
456
457    #[test]
458    fn test_sequence_collection_write_fasta() {
459        use std::fs;
460        use tempfile::tempdir;
461
462        let seqcol =
463            load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
464
465        let dir = tempdir().expect("Failed to create temp dir");
466        let fasta_path = dir.path().join("output.fa");
467
468        seqcol
469            .write_fasta(&fasta_path, None)
470            .expect("Failed to write FASTA");
471
472        let content = fs::read_to_string(&fasta_path).expect("Failed to read file");
473        assert!(content.contains(">chrX"));
474        assert!(content.contains("TTGGGGAA"));
475    }
476
477    #[test]
478    fn test_digest_sequence_basic() {
479        let seq = digest_sequence("test_seq", b"ACGTACGT");
480        assert_eq!(seq.metadata().name, "test_seq");
481        assert_eq!(seq.metadata().length, 8);
482        assert!(!seq.metadata().sha512t24u.is_empty());
483        assert!(seq.is_loaded());
484        assert_eq!(seq.sequence().unwrap(), b"ACGTACGT");
485    }
486
487    #[test]
488    fn test_digest_sequence_matches_fasta_digest() {
489        let seqcol =
490            load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
491        let fasta_seq = &seqcol.sequences[0]; // chrX: TTGGGGAA
492
493        let prog_seq = digest_sequence("chrX", b"TTGGGGAA");
494
495        assert_eq!(
496            prog_seq.metadata().sha512t24u,
497            fasta_seq.metadata().sha512t24u
498        );
499        assert_eq!(prog_seq.metadata().md5, fasta_seq.metadata().md5);
500    }
501
502    #[test]
503    fn test_sequence_metadata_disk_size() {
504        use crate::store::StorageMode;
505
506        let metadata = SequenceMetadata {
507            name: "test".to_string(),
508            description: None,
509            length: 1000,
510            sha512t24u: "test".to_string(),
511            md5: "test".to_string(),
512            alphabet: AlphabetType::Dna2bit,
513            fai: None,
514        };
515
516        assert_eq!(metadata.disk_size(&StorageMode::Raw), 1000);
517        assert_eq!(metadata.disk_size(&StorageMode::Encoded), 250);
518    }
519
520    #[test]
521    fn test_sequence_record_to_file() {
522        use std::fs;
523        use tempfile::tempdir;
524
525        let seqcol =
526            load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
527
528        let dir = tempdir().expect("Failed to create temp dir");
529        let file_path = dir.path().join("test_seq.txt");
530
531        seqcol.sequences[0]
532            .to_file(&file_path)
533            .expect("Failed to write file");
534
535        let content = fs::read(&file_path).expect("Failed to read file");
536        assert!(!content.is_empty());
537    }
538
539    #[test]
540    fn test_rgsi_round_trip_ancillary_digests() {
541        use tempfile::tempdir;
542
543        let mut seqcol =
544            digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
545
546        // Compute ancillary digests before writing
547        seqcol
548            .metadata
549            .compute_ancillary_digests(&seqcol.sequences);
550        assert!(seqcol.metadata.name_length_pairs_digest.is_some());
551        assert!(seqcol.metadata.sorted_name_length_pairs_digest.is_some());
552        assert!(seqcol.metadata.sorted_sequences_digest.is_some());
553
554        // Write RGSI
555        let dir = tempdir().expect("Failed to create temp dir");
556        let rgsi_path = dir.path().join("test_ancillary.rgsi");
557        seqcol
558            .write_collection_rgsi(&rgsi_path)
559            .expect("Failed to write RGSI");
560
561        // Read it back
562        let loaded = read_rgsi_file(&rgsi_path).expect("Failed to read RGSI");
563
564        // Ancillary digests should be preserved
565        assert_eq!(
566            loaded.metadata.name_length_pairs_digest,
567            seqcol.metadata.name_length_pairs_digest
568        );
569        assert_eq!(
570            loaded.metadata.sorted_name_length_pairs_digest,
571            seqcol.metadata.sorted_name_length_pairs_digest
572        );
573        assert_eq!(
574            loaded.metadata.sorted_sequences_digest,
575            seqcol.metadata.sorted_sequences_digest
576        );
577    }
578}