Skip to main content

gtars_refget/
collection.rs

1//! Extended SequenceCollection with filesystem operations.
2//!
3//! This module adds file I/O and caching methods to the core types
4//! defined in `crate::digest::types`.
5
6use anyhow::Result;
7use std::io::Write;
8use std::path::Path;
9
10use crate::fasta::digest_fasta;
11use crate::utils::PathExtension;
12
13// Re-export core types from digest::types for backward compatibility
14pub use crate::digest::types::{
15    FaiMetadata, SeqColDigestLvl1, SequenceCollection, SequenceCollectionMetadata,
16    SequenceCollectionRecord, SequenceMetadata, SequenceRecord, digest_sequence,
17    digest_sequence_with_description, parse_rgsi_line,
18};
19
20/// Shared implementation for writing RGSI (Refget Sequence Index) files.
21fn write_rgsi_impl<P: AsRef<Path>>(
22    file_path: P,
23    metadata: &SequenceCollectionMetadata,
24    sequences: Option<&[SequenceRecord]>,
25) -> Result<()> {
26    let file_path = file_path.as_ref();
27    let mut file = std::fs::File::create(file_path)?;
28
29    // Write collection digest headers
30    writeln!(file, "##seqcol_digest={}", metadata.digest)?;
31    writeln!(file, "##names_digest={}", metadata.names_digest)?;
32    writeln!(file, "##sequences_digest={}", metadata.sequences_digest)?;
33    writeln!(file, "##lengths_digest={}", metadata.lengths_digest)?;
34    writeln!(
35        file,
36        "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
37    )?;
38
39    // Write sequence metadata if available
40    if let Some(seqs) = sequences {
41        for seq_record in seqs {
42            let seq_meta = seq_record.metadata();
43            writeln!(
44                file,
45                "{}\t{}\t{}\t{}\t{}\t{}",
46                seq_meta.name,
47                seq_meta.length,
48                seq_meta.alphabet,
49                seq_meta.sha512t24u,
50                seq_meta.md5,
51                seq_meta.description.as_deref().unwrap_or("")
52            )?;
53        }
54    }
55    Ok(())
56}
57
58// ============================================================================
59// Extensions to SequenceMetadata for filesystem operations
60// ============================================================================
61
62/// Extension trait for SequenceMetadata with filesystem-dependent methods.
63pub trait SequenceMetadataExt {
64    fn disk_size(&self, mode: &crate::store::StorageMode) -> usize;
65}
66
67impl SequenceMetadataExt for SequenceMetadata {
68    /// Calculate the disk size in bytes for this sequence.
69    fn disk_size(&self, mode: &crate::store::StorageMode) -> usize {
70        match mode {
71            crate::store::StorageMode::Raw => self.length,
72            crate::store::StorageMode::Encoded => {
73                let bits_per_symbol = self.alphabet.bits_per_symbol();
74                let total_bits = self.length * bits_per_symbol;
75                total_bits.div_ceil(8)
76            }
77        }
78    }
79}
80
81// ============================================================================
82// Extensions to SequenceRecord for filesystem operations
83// ============================================================================
84
85/// Extension trait for SequenceRecord with filesystem-dependent methods.
86pub trait SequenceRecordExt {
87    fn to_file<P: AsRef<Path>>(&self, path: P) -> Result<()>;
88}
89
90impl SequenceRecordExt for SequenceRecord {
91    /// Write a single sequence to a file.
92    fn to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
93        use std::fs::{self, File};
94        use std::io::Write;
95
96        let data = match self {
97            SequenceRecord::Stub(_) => {
98                return Err(anyhow::anyhow!(
99                    "Cannot write file: sequence data not loaded"
100                ));
101            }
102            SequenceRecord::Full { sequence, .. } => sequence,
103        };
104
105        // Create parent directories if they don't exist
106        if let Some(parent) = path.as_ref().parent() {
107            fs::create_dir_all(parent)?;
108        }
109
110        let mut file = File::create(path)?;
111        file.write_all(data)?;
112        Ok(())
113    }
114}
115
116// ============================================================================
117// Extensions to SequenceCollectionRecord for filesystem operations
118// ============================================================================
119
120/// Extension trait for SequenceCollectionRecord with filesystem-dependent methods.
121pub trait SequenceCollectionRecordExt {
122    fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()>;
123}
124
125impl SequenceCollectionRecordExt for SequenceCollectionRecord {
126    /// Write the collection to an RGSI file.
127    fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
128        write_rgsi_impl(file_path, self.metadata(), self.sequences())
129    }
130}
131
132// ============================================================================
133// Extensions to SequenceCollection for filesystem operations
134// ============================================================================
135
136/// Extension trait for SequenceCollection with filesystem-dependent methods.
137pub trait SequenceCollectionExt {
138    fn from_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
139    fn from_rgsi<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
140    fn from_path_no_cache<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
141    fn from_path_with_cache<P: AsRef<Path>>(
142        file_path: P,
143        read_cache: bool,
144        write_cache: bool,
145    ) -> Result<SequenceCollection>;
146    fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()>;
147    fn write_rgsi(&self) -> Result<()>;
148    fn to_record(&self) -> SequenceCollectionRecord;
149    fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()>;
150}
151
152impl SequenceCollectionExt for SequenceCollection {
153    /// Default behavior: read and write cache
154    fn from_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
155        Self::from_path_with_cache(file_path, true, true)
156    }
157
158    fn from_rgsi<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
159        let rgsi_file_path = file_path.as_ref().replace_exts_with("rgsi");
160
161        if rgsi_file_path.exists() {
162            read_rgsi_file(&rgsi_file_path)
163        } else {
164            Err(anyhow::anyhow!(
165                "RGSI file does not exist at {:?}",
166                rgsi_file_path
167            ))
168        }
169    }
170
171    /// No caching at all
172    fn from_path_no_cache<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
173        Self::from_path_with_cache(file_path, false, false)
174    }
175
176    fn from_path_with_cache<P: AsRef<Path>>(
177        file_path: P,
178        read_cache: bool,
179        write_cache: bool,
180    ) -> Result<SequenceCollection> {
181        let fa_file_path = file_path.as_ref();
182        let rgsi_file_path = fa_file_path.replace_exts_with("rgsi");
183
184        // Check if the cache file exists and is valid
185        if read_cache && rgsi_file_path.exists() {
186            let seqcol = read_rgsi_file(&rgsi_file_path)?;
187            if !seqcol.sequences.is_empty() {
188                return Ok(seqcol);
189            }
190            // Cache is empty/stale - fall through to re-digest
191            let _ = std::fs::remove_file(&rgsi_file_path);
192        }
193
194        // Digest the fasta file
195        let seqcol: SequenceCollection = digest_fasta(file_path.as_ref())?;
196
197        // Write the SequenceCollection to the RGSI file
198        if write_cache && !rgsi_file_path.exists() {
199            seqcol.write_rgsi()?;
200        }
201        Ok(seqcol)
202    }
203
204    /// Write the SequenceCollection to an RGSI file.
205    fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
206        write_rgsi_impl(file_path, &self.metadata, Some(&self.sequences))
207    }
208
209    /// Write the SequenceCollection to an RGSI file using the stored file path.
210    fn write_rgsi(&self) -> Result<()> {
211        if let Some(file_path) = &self.metadata.file_path {
212            let rgsi_file_path = file_path.replace_exts_with("rgsi");
213            self.write_collection_rgsi(rgsi_file_path)
214        } else {
215            Err(anyhow::anyhow!(
216                "No file path specified for RGSI output. Use `write_collection_rgsi` to specify a file path."
217            ))
218        }
219    }
220
221    /// Convert to a SequenceCollectionRecord for storage.
222    fn to_record(&self) -> SequenceCollectionRecord {
223        SequenceCollectionRecord::from(self.clone())
224    }
225
226    /// Write the SequenceCollection to a FASTA file.
227    fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()> {
228        use std::fs::File;
229
230        let line_width = line_width.unwrap_or(70);
231        let mut output_file = File::create(file_path)?;
232
233        for record in &self.sequences {
234            if !record.is_loaded() {
235                return Err(anyhow::anyhow!(
236                    "Cannot write FASTA: sequence '{}' does not have data loaded",
237                    record.metadata().name
238                ));
239            }
240
241            let decoded_sequence = record.decode().ok_or_else(|| {
242                anyhow::anyhow!("Failed to decode sequence '{}'", record.metadata().name)
243            })?;
244
245            // Write FASTA header
246            let metadata = record.metadata();
247            let header = match &metadata.description {
248                Some(desc) => format!(">{} {}", metadata.name, desc),
249                None => format!(">{}", metadata.name),
250            };
251            writeln!(output_file, "{}", header)?;
252
253            // Write sequence with line wrapping
254            for chunk in decoded_sequence.as_bytes().chunks(line_width) {
255                output_file.write_all(chunk)?;
256                output_file.write_all(b"\n")?;
257            }
258        }
259
260        Ok(())
261    }
262}
263
264// ============================================================================
265// RGSI File I/O
266// ============================================================================
267
268/// Read an RGSI file and return a SequenceCollection.
269pub fn read_rgsi_file<T: AsRef<Path>>(file_path: T) -> Result<SequenceCollection> {
270    use std::io::BufRead;
271
272    let file = std::fs::File::open(&file_path)?;
273    let reader = std::io::BufReader::new(file);
274    let mut results = Vec::new();
275
276    // Variables to store header metadata
277    let mut seqcol_digest = String::new();
278    let mut names_digest = String::new();
279    let mut sequences_digest = String::new();
280    let mut lengths_digest = String::new();
281
282    for line in reader.lines() {
283        let line = line?;
284
285        // Parse header metadata lines
286        if line.starts_with("##") {
287            if let Some(stripped) = line.strip_prefix("##") {
288                if let Some((key, value)) = stripped.split_once('=') {
289                    match key {
290                        "seqcol_digest" => seqcol_digest = value.to_string(),
291                        "names_digest" => names_digest = value.to_string(),
292                        "sequences_digest" => sequences_digest = value.to_string(),
293                        "lengths_digest" => lengths_digest = value.to_string(),
294                        _ => {} // Ignore unknown metadata keys
295                    }
296                }
297            }
298            continue;
299        }
300
301        // Skip other comment lines
302        if line.starts_with('#') {
303            continue;
304        }
305
306        // Parse sequence data line
307        if let Some(metadata) = parse_rgsi_line(&line) {
308            results.push(SequenceRecord::Stub(metadata));
309        }
310    }
311
312    // Build collection metadata
313    let collection_metadata = if sequences_digest.is_empty()
314        || names_digest.is_empty()
315        || lengths_digest.is_empty()
316    {
317        // Compute from sequence records
318        SequenceCollectionMetadata::from_sequences(&results, Some(file_path.as_ref().to_path_buf()))
319    } else {
320        // Use digests from file header
321        let lvl1 = SeqColDigestLvl1 {
322            sequences_digest,
323            names_digest,
324            lengths_digest,
325        };
326
327        let digest = if seqcol_digest.is_empty() {
328            lvl1.to_digest()
329        } else {
330            seqcol_digest
331        };
332
333        SequenceCollectionMetadata {
334            digest,
335            n_sequences: results.len(),
336            names_digest: lvl1.names_digest,
337            sequences_digest: lvl1.sequences_digest,
338            lengths_digest: lvl1.lengths_digest,
339            file_path: Some(file_path.as_ref().to_path_buf()),
340        }
341    };
342
343    Ok(SequenceCollection {
344        metadata: collection_metadata,
345        sequences: results,
346    })
347}
348
349// ============================================================================
350// Tests
351// ============================================================================
352
353#[cfg(test)]
354mod tests {
355    use super::*;
356    use crate::digest::encode_sequence;
357    use crate::digest::{AlphabetType, lookup_alphabet};
358    use crate::fasta::{digest_fasta, load_fasta};
359
360    #[test]
361    fn test_decode_returns_none_when_no_data() {
362        let seqcol =
363            digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
364
365        for seq_record in &seqcol.sequences {
366            assert!(!seq_record.is_loaded());
367            assert_eq!(seq_record.decode(), None);
368        }
369    }
370
371    #[test]
372    fn test_decode_with_loaded_data() {
373        let seqcol =
374            load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
375
376        let expected_sequences = vec![("chrX", "TTGGGGAA"), ("chr1", "GGAA"), ("chr2", "GCGC")];
377
378        for (seq_record, (expected_name, expected_seq)) in
379            seqcol.sequences.iter().zip(expected_sequences.iter())
380        {
381            assert_eq!(seq_record.metadata().name, *expected_name);
382            assert!(seq_record.is_loaded());
383            let decoded = seq_record.decode().expect("decode() should return Some");
384            assert_eq!(decoded, *expected_seq);
385        }
386    }
387
388    #[test]
389    fn test_decode_handles_encoded_data() {
390        let sequence = b"ACGT";
391        let alphabet = lookup_alphabet(&AlphabetType::Dna2bit);
392        let encoded_data = encode_sequence(sequence, alphabet);
393
394        let record = SequenceRecord::Full {
395            metadata: SequenceMetadata {
396                name: "test_seq".to_string(),
397                description: None,
398                length: 4,
399                sha512t24u: "test_digest".to_string(),
400                md5: "test_md5".to_string(),
401                alphabet: AlphabetType::Dna2bit,
402                fai: None,
403            },
404            sequence: encoded_data,
405        };
406
407        let decoded = record.decode().expect("Should decode encoded data");
408        assert_eq!(decoded, "ACGT");
409    }
410
411    #[test]
412    fn test_sequence_collection_from_fasta() {
413        let seqcol = SequenceCollection::from_fasta("../tests/data/fasta/base.fa")
414            .expect("Failed to load FASTA");
415
416        assert_eq!(seqcol.sequences.len(), 3);
417        assert!(!seqcol.metadata.digest.is_empty());
418    }
419
420    #[test]
421    fn test_sequence_collection_write_and_read_rgsi() {
422        use tempfile::tempdir;
423
424        let seqcol =
425            digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
426
427        let dir = tempdir().expect("Failed to create temp dir");
428        let rgsi_path = dir.path().join("test.rgsi");
429
430        seqcol
431            .write_collection_rgsi(&rgsi_path)
432            .expect("Failed to write RGSI");
433
434        let loaded = read_rgsi_file(&rgsi_path).expect("Failed to read RGSI");
435
436        assert_eq!(loaded.metadata.digest, seqcol.metadata.digest);
437        assert_eq!(loaded.sequences.len(), seqcol.sequences.len());
438    }
439
440    #[test]
441    fn test_sequence_collection_write_fasta() {
442        use std::fs;
443        use tempfile::tempdir;
444
445        let seqcol =
446            load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
447
448        let dir = tempdir().expect("Failed to create temp dir");
449        let fasta_path = dir.path().join("output.fa");
450
451        seqcol
452            .write_fasta(&fasta_path, None)
453            .expect("Failed to write FASTA");
454
455        let content = fs::read_to_string(&fasta_path).expect("Failed to read file");
456        assert!(content.contains(">chrX"));
457        assert!(content.contains("TTGGGGAA"));
458    }
459
460    #[test]
461    fn test_digest_sequence_basic() {
462        let seq = digest_sequence("test_seq", b"ACGTACGT");
463        assert_eq!(seq.metadata().name, "test_seq");
464        assert_eq!(seq.metadata().length, 8);
465        assert!(!seq.metadata().sha512t24u.is_empty());
466        assert!(seq.is_loaded());
467        assert_eq!(seq.sequence().unwrap(), b"ACGTACGT");
468    }
469
470    #[test]
471    fn test_digest_sequence_matches_fasta_digest() {
472        let seqcol =
473            load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
474        let fasta_seq = &seqcol.sequences[0]; // chrX: TTGGGGAA
475
476        let prog_seq = digest_sequence("chrX", b"TTGGGGAA");
477
478        assert_eq!(
479            prog_seq.metadata().sha512t24u,
480            fasta_seq.metadata().sha512t24u
481        );
482        assert_eq!(prog_seq.metadata().md5, fasta_seq.metadata().md5);
483    }
484
485    #[test]
486    fn test_sequence_metadata_disk_size() {
487        use crate::store::StorageMode;
488
489        let metadata = SequenceMetadata {
490            name: "test".to_string(),
491            description: None,
492            length: 1000,
493            sha512t24u: "test".to_string(),
494            md5: "test".to_string(),
495            alphabet: AlphabetType::Dna2bit,
496            fai: None,
497        };
498
499        assert_eq!(metadata.disk_size(&StorageMode::Raw), 1000);
500        assert_eq!(metadata.disk_size(&StorageMode::Encoded), 250);
501    }
502
503    #[test]
504    fn test_sequence_record_to_file() {
505        use std::fs;
506        use tempfile::tempdir;
507
508        let seqcol =
509            load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
510
511        let dir = tempdir().expect("Failed to create temp dir");
512        let file_path = dir.path().join("test_seq.txt");
513
514        seqcol.sequences[0]
515            .to_file(&file_path)
516            .expect("Failed to write file");
517
518        let content = fs::read(&file_path).expect("Failed to read file");
519        assert!(!content.is_empty());
520    }
521}