1use anyhow::Result;
7use std::io::Write;
8use std::path::Path;
9
10use crate::fasta::digest_fasta;
11use crate::utils::PathExtension;
12
13pub use crate::digest::types::{
15 FaiMetadata, SeqColDigestLvl1, SequenceCollection, SequenceCollectionMetadata,
16 SequenceCollectionRecord, SequenceMetadata, SequenceRecord, digest_sequence,
17 digest_sequence_with_description, parse_rgsi_line,
18};
19
20fn write_rgsi_impl<P: AsRef<Path>>(
22 file_path: P,
23 metadata: &SequenceCollectionMetadata,
24 sequences: Option<&[SequenceRecord]>,
25) -> Result<()> {
26 let file_path = file_path.as_ref();
27 let mut file = std::fs::File::create(file_path)?;
28
29 writeln!(file, "##seqcol_digest={}", metadata.digest)?;
31 writeln!(file, "##names_digest={}", metadata.names_digest)?;
32 writeln!(file, "##sequences_digest={}", metadata.sequences_digest)?;
33 writeln!(file, "##lengths_digest={}", metadata.lengths_digest)?;
34 writeln!(
35 file,
36 "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
37 )?;
38
39 if let Some(seqs) = sequences {
41 for seq_record in seqs {
42 let seq_meta = seq_record.metadata();
43 writeln!(
44 file,
45 "{}\t{}\t{}\t{}\t{}\t{}",
46 seq_meta.name,
47 seq_meta.length,
48 seq_meta.alphabet,
49 seq_meta.sha512t24u,
50 seq_meta.md5,
51 seq_meta.description.as_deref().unwrap_or("")
52 )?;
53 }
54 }
55 Ok(())
56}
57
58pub trait SequenceMetadataExt {
64 fn disk_size(&self, mode: &crate::store::StorageMode) -> usize;
65}
66
67impl SequenceMetadataExt for SequenceMetadata {
68 fn disk_size(&self, mode: &crate::store::StorageMode) -> usize {
70 match mode {
71 crate::store::StorageMode::Raw => self.length,
72 crate::store::StorageMode::Encoded => {
73 let bits_per_symbol = self.alphabet.bits_per_symbol();
74 let total_bits = self.length * bits_per_symbol;
75 total_bits.div_ceil(8)
76 }
77 }
78 }
79}
80
81pub trait SequenceRecordExt {
87 fn to_file<P: AsRef<Path>>(&self, path: P) -> Result<()>;
88}
89
90impl SequenceRecordExt for SequenceRecord {
91 fn to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
93 use std::fs::{self, File};
94 use std::io::Write;
95
96 let data = match self {
97 SequenceRecord::Stub(_) => {
98 return Err(anyhow::anyhow!(
99 "Cannot write file: sequence data not loaded"
100 ));
101 }
102 SequenceRecord::Full { sequence, .. } => sequence,
103 };
104
105 if let Some(parent) = path.as_ref().parent() {
107 fs::create_dir_all(parent)?;
108 }
109
110 let mut file = File::create(path)?;
111 file.write_all(data)?;
112 Ok(())
113 }
114}
115
116pub trait SequenceCollectionRecordExt {
122 fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()>;
123}
124
125impl SequenceCollectionRecordExt for SequenceCollectionRecord {
126 fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
128 write_rgsi_impl(file_path, self.metadata(), self.sequences())
129 }
130}
131
132pub trait SequenceCollectionExt {
138 fn from_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
139 fn from_rgsi<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
140 fn from_path_no_cache<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
141 fn from_path_with_cache<P: AsRef<Path>>(
142 file_path: P,
143 read_cache: bool,
144 write_cache: bool,
145 ) -> Result<SequenceCollection>;
146 fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()>;
147 fn write_rgsi(&self) -> Result<()>;
148 fn to_record(&self) -> SequenceCollectionRecord;
149 fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()>;
150}
151
152impl SequenceCollectionExt for SequenceCollection {
153 fn from_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
155 Self::from_path_with_cache(file_path, true, true)
156 }
157
158 fn from_rgsi<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
159 let rgsi_file_path = file_path.as_ref().replace_exts_with("rgsi");
160
161 if rgsi_file_path.exists() {
162 read_rgsi_file(&rgsi_file_path)
163 } else {
164 Err(anyhow::anyhow!(
165 "RGSI file does not exist at {:?}",
166 rgsi_file_path
167 ))
168 }
169 }
170
171 fn from_path_no_cache<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
173 Self::from_path_with_cache(file_path, false, false)
174 }
175
176 fn from_path_with_cache<P: AsRef<Path>>(
177 file_path: P,
178 read_cache: bool,
179 write_cache: bool,
180 ) -> Result<SequenceCollection> {
181 let fa_file_path = file_path.as_ref();
182 let rgsi_file_path = fa_file_path.replace_exts_with("rgsi");
183
184 if read_cache && rgsi_file_path.exists() {
186 let seqcol = read_rgsi_file(&rgsi_file_path)?;
187 if !seqcol.sequences.is_empty() {
188 return Ok(seqcol);
189 }
190 let _ = std::fs::remove_file(&rgsi_file_path);
192 }
193
194 let seqcol: SequenceCollection = digest_fasta(file_path.as_ref())?;
196
197 if write_cache && !rgsi_file_path.exists() {
199 seqcol.write_rgsi()?;
200 }
201 Ok(seqcol)
202 }
203
204 fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
206 write_rgsi_impl(file_path, &self.metadata, Some(&self.sequences))
207 }
208
209 fn write_rgsi(&self) -> Result<()> {
211 if let Some(file_path) = &self.metadata.file_path {
212 let rgsi_file_path = file_path.replace_exts_with("rgsi");
213 self.write_collection_rgsi(rgsi_file_path)
214 } else {
215 Err(anyhow::anyhow!(
216 "No file path specified for RGSI output. Use `write_collection_rgsi` to specify a file path."
217 ))
218 }
219 }
220
221 fn to_record(&self) -> SequenceCollectionRecord {
223 SequenceCollectionRecord::from(self.clone())
224 }
225
226 fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()> {
228 use std::fs::File;
229
230 let line_width = line_width.unwrap_or(70);
231 let mut output_file = File::create(file_path)?;
232
233 for record in &self.sequences {
234 if !record.is_loaded() {
235 return Err(anyhow::anyhow!(
236 "Cannot write FASTA: sequence '{}' does not have data loaded",
237 record.metadata().name
238 ));
239 }
240
241 let decoded_sequence = record.decode().ok_or_else(|| {
242 anyhow::anyhow!("Failed to decode sequence '{}'", record.metadata().name)
243 })?;
244
245 let metadata = record.metadata();
247 let header = match &metadata.description {
248 Some(desc) => format!(">{} {}", metadata.name, desc),
249 None => format!(">{}", metadata.name),
250 };
251 writeln!(output_file, "{}", header)?;
252
253 for chunk in decoded_sequence.as_bytes().chunks(line_width) {
255 output_file.write_all(chunk)?;
256 output_file.write_all(b"\n")?;
257 }
258 }
259
260 Ok(())
261 }
262}
263
264pub fn read_rgsi_file<T: AsRef<Path>>(file_path: T) -> Result<SequenceCollection> {
270 use std::io::BufRead;
271
272 let file = std::fs::File::open(&file_path)?;
273 let reader = std::io::BufReader::new(file);
274 let mut results = Vec::new();
275
276 let mut seqcol_digest = String::new();
278 let mut names_digest = String::new();
279 let mut sequences_digest = String::new();
280 let mut lengths_digest = String::new();
281
282 for line in reader.lines() {
283 let line = line?;
284
285 if line.starts_with("##") {
287 if let Some(stripped) = line.strip_prefix("##") {
288 if let Some((key, value)) = stripped.split_once('=') {
289 match key {
290 "seqcol_digest" => seqcol_digest = value.to_string(),
291 "names_digest" => names_digest = value.to_string(),
292 "sequences_digest" => sequences_digest = value.to_string(),
293 "lengths_digest" => lengths_digest = value.to_string(),
294 _ => {} }
296 }
297 }
298 continue;
299 }
300
301 if line.starts_with('#') {
303 continue;
304 }
305
306 if let Some(metadata) = parse_rgsi_line(&line) {
308 results.push(SequenceRecord::Stub(metadata));
309 }
310 }
311
312 let collection_metadata = if sequences_digest.is_empty()
314 || names_digest.is_empty()
315 || lengths_digest.is_empty()
316 {
317 SequenceCollectionMetadata::from_sequences(&results, Some(file_path.as_ref().to_path_buf()))
319 } else {
320 let lvl1 = SeqColDigestLvl1 {
322 sequences_digest,
323 names_digest,
324 lengths_digest,
325 };
326
327 let digest = if seqcol_digest.is_empty() {
328 lvl1.to_digest()
329 } else {
330 seqcol_digest
331 };
332
333 SequenceCollectionMetadata {
334 digest,
335 n_sequences: results.len(),
336 names_digest: lvl1.names_digest,
337 sequences_digest: lvl1.sequences_digest,
338 lengths_digest: lvl1.lengths_digest,
339 file_path: Some(file_path.as_ref().to_path_buf()),
340 }
341 };
342
343 Ok(SequenceCollection {
344 metadata: collection_metadata,
345 sequences: results,
346 })
347}
348
349#[cfg(test)]
354mod tests {
355 use super::*;
356 use crate::digest::encode_sequence;
357 use crate::digest::{AlphabetType, lookup_alphabet};
358 use crate::fasta::{digest_fasta, load_fasta};
359
360 #[test]
361 fn test_decode_returns_none_when_no_data() {
362 let seqcol =
363 digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
364
365 for seq_record in &seqcol.sequences {
366 assert!(!seq_record.is_loaded());
367 assert_eq!(seq_record.decode(), None);
368 }
369 }
370
371 #[test]
372 fn test_decode_with_loaded_data() {
373 let seqcol =
374 load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
375
376 let expected_sequences = vec![("chrX", "TTGGGGAA"), ("chr1", "GGAA"), ("chr2", "GCGC")];
377
378 for (seq_record, (expected_name, expected_seq)) in
379 seqcol.sequences.iter().zip(expected_sequences.iter())
380 {
381 assert_eq!(seq_record.metadata().name, *expected_name);
382 assert!(seq_record.is_loaded());
383 let decoded = seq_record.decode().expect("decode() should return Some");
384 assert_eq!(decoded, *expected_seq);
385 }
386 }
387
388 #[test]
389 fn test_decode_handles_encoded_data() {
390 let sequence = b"ACGT";
391 let alphabet = lookup_alphabet(&AlphabetType::Dna2bit);
392 let encoded_data = encode_sequence(sequence, alphabet);
393
394 let record = SequenceRecord::Full {
395 metadata: SequenceMetadata {
396 name: "test_seq".to_string(),
397 description: None,
398 length: 4,
399 sha512t24u: "test_digest".to_string(),
400 md5: "test_md5".to_string(),
401 alphabet: AlphabetType::Dna2bit,
402 fai: None,
403 },
404 sequence: encoded_data,
405 };
406
407 let decoded = record.decode().expect("Should decode encoded data");
408 assert_eq!(decoded, "ACGT");
409 }
410
411 #[test]
412 fn test_sequence_collection_from_fasta() {
413 let seqcol = SequenceCollection::from_fasta("../tests/data/fasta/base.fa")
414 .expect("Failed to load FASTA");
415
416 assert_eq!(seqcol.sequences.len(), 3);
417 assert!(!seqcol.metadata.digest.is_empty());
418 }
419
420 #[test]
421 fn test_sequence_collection_write_and_read_rgsi() {
422 use tempfile::tempdir;
423
424 let seqcol =
425 digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
426
427 let dir = tempdir().expect("Failed to create temp dir");
428 let rgsi_path = dir.path().join("test.rgsi");
429
430 seqcol
431 .write_collection_rgsi(&rgsi_path)
432 .expect("Failed to write RGSI");
433
434 let loaded = read_rgsi_file(&rgsi_path).expect("Failed to read RGSI");
435
436 assert_eq!(loaded.metadata.digest, seqcol.metadata.digest);
437 assert_eq!(loaded.sequences.len(), seqcol.sequences.len());
438 }
439
440 #[test]
441 fn test_sequence_collection_write_fasta() {
442 use std::fs;
443 use tempfile::tempdir;
444
445 let seqcol =
446 load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
447
448 let dir = tempdir().expect("Failed to create temp dir");
449 let fasta_path = dir.path().join("output.fa");
450
451 seqcol
452 .write_fasta(&fasta_path, None)
453 .expect("Failed to write FASTA");
454
455 let content = fs::read_to_string(&fasta_path).expect("Failed to read file");
456 assert!(content.contains(">chrX"));
457 assert!(content.contains("TTGGGGAA"));
458 }
459
460 #[test]
461 fn test_digest_sequence_basic() {
462 let seq = digest_sequence("test_seq", b"ACGTACGT");
463 assert_eq!(seq.metadata().name, "test_seq");
464 assert_eq!(seq.metadata().length, 8);
465 assert!(!seq.metadata().sha512t24u.is_empty());
466 assert!(seq.is_loaded());
467 assert_eq!(seq.sequence().unwrap(), b"ACGTACGT");
468 }
469
470 #[test]
471 fn test_digest_sequence_matches_fasta_digest() {
472 let seqcol =
473 load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
474 let fasta_seq = &seqcol.sequences[0]; let prog_seq = digest_sequence("chrX", b"TTGGGGAA");
477
478 assert_eq!(
479 prog_seq.metadata().sha512t24u,
480 fasta_seq.metadata().sha512t24u
481 );
482 assert_eq!(prog_seq.metadata().md5, fasta_seq.metadata().md5);
483 }
484
485 #[test]
486 fn test_sequence_metadata_disk_size() {
487 use crate::store::StorageMode;
488
489 let metadata = SequenceMetadata {
490 name: "test".to_string(),
491 description: None,
492 length: 1000,
493 sha512t24u: "test".to_string(),
494 md5: "test".to_string(),
495 alphabet: AlphabetType::Dna2bit,
496 fai: None,
497 };
498
499 assert_eq!(metadata.disk_size(&StorageMode::Raw), 1000);
500 assert_eq!(metadata.disk_size(&StorageMode::Encoded), 250);
501 }
502
503 #[test]
504 fn test_sequence_record_to_file() {
505 use std::fs;
506 use tempfile::tempdir;
507
508 let seqcol =
509 load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
510
511 let dir = tempdir().expect("Failed to create temp dir");
512 let file_path = dir.path().join("test_seq.txt");
513
514 seqcol.sequences[0]
515 .to_file(&file_path)
516 .expect("Failed to write file");
517
518 let content = fs::read(&file_path).expect("Failed to read file");
519 assert!(!content.is_empty());
520 }
521}