1use anyhow::Result;
7use std::io::Write;
8use std::path::Path;
9
10use crate::fasta::digest_fasta;
11use crate::utils::PathExtension;
12
13pub use crate::digest::types::{
15 FaiMetadata, SeqColDigestLvl1, SequenceCollection, SequenceCollectionMetadata,
16 SequenceCollectionRecord, SequenceMetadata, SequenceRecord, digest_sequence,
17 digest_sequence_with_description, parse_rgsi_line,
18};
19
20fn write_rgsi_impl<P: AsRef<Path>>(
22 file_path: P,
23 metadata: &SequenceCollectionMetadata,
24 sequences: Option<&[SequenceRecord]>,
25) -> Result<()> {
26 let file_path = file_path.as_ref();
27 let mut file = std::fs::File::create(file_path)?;
28
29 writeln!(file, "##seqcol_digest={}", metadata.digest)?;
31 writeln!(file, "##names_digest={}", metadata.names_digest)?;
32 writeln!(file, "##sequences_digest={}", metadata.sequences_digest)?;
33 writeln!(file, "##lengths_digest={}", metadata.lengths_digest)?;
34 if let Some(ref nlp) = metadata.name_length_pairs_digest {
35 writeln!(file, "##name_length_pairs_digest={}", nlp)?;
36 }
37 if let Some(ref snlp) = metadata.sorted_name_length_pairs_digest {
38 writeln!(file, "##sorted_name_length_pairs_digest={}", snlp)?;
39 }
40 if let Some(ref ss) = metadata.sorted_sequences_digest {
41 writeln!(file, "##sorted_sequences_digest={}", ss)?;
42 }
43 writeln!(
44 file,
45 "#name\tlength\talphabet\tsha512t24u\tmd5\tdescription"
46 )?;
47
48 if let Some(seqs) = sequences {
50 for seq_record in seqs {
51 let seq_meta = seq_record.metadata();
52 writeln!(
53 file,
54 "{}\t{}\t{}\t{}\t{}\t{}",
55 seq_meta.name,
56 seq_meta.length,
57 seq_meta.alphabet,
58 seq_meta.sha512t24u,
59 seq_meta.md5,
60 seq_meta.description.as_deref().unwrap_or("")
61 )?;
62 }
63 }
64 Ok(())
65}
66
67pub trait SequenceMetadataExt {
73 fn disk_size(&self, mode: &crate::store::StorageMode) -> usize;
74}
75
76impl SequenceMetadataExt for SequenceMetadata {
77 fn disk_size(&self, mode: &crate::store::StorageMode) -> usize {
79 match mode {
80 crate::store::StorageMode::Raw => self.length,
81 crate::store::StorageMode::Encoded => {
82 let bits_per_symbol = self.alphabet.bits_per_symbol();
83 let total_bits = self.length * bits_per_symbol;
84 total_bits.div_ceil(8)
85 }
86 }
87 }
88}
89
90pub trait SequenceRecordExt {
96 fn to_file<P: AsRef<Path>>(&self, path: P) -> Result<()>;
97}
98
99impl SequenceRecordExt for SequenceRecord {
100 fn to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
102 use std::fs::{self, File};
103 use std::io::Write;
104
105 let data = match self {
106 SequenceRecord::Stub(_) => {
107 return Err(anyhow::anyhow!(
108 "Cannot write file: sequence data not loaded"
109 ));
110 }
111 SequenceRecord::Full { sequence, .. } => sequence,
112 };
113
114 if let Some(parent) = path.as_ref().parent() {
116 fs::create_dir_all(parent)?;
117 }
118
119 let mut file = File::create(path)?;
120 file.write_all(data)?;
121 Ok(())
122 }
123}
124
125pub trait SequenceCollectionRecordExt {
131 fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()>;
132}
133
134impl SequenceCollectionRecordExt for SequenceCollectionRecord {
135 fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
137 write_rgsi_impl(file_path, self.metadata(), self.sequences())
138 }
139}
140
141pub trait SequenceCollectionExt {
147 fn from_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
148 fn from_rgsi<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
149 fn from_path_no_cache<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection>;
150 fn from_path_with_cache<P: AsRef<Path>>(
151 file_path: P,
152 read_cache: bool,
153 write_cache: bool,
154 ) -> Result<SequenceCollection>;
155 fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()>;
156 fn write_rgsi(&self) -> Result<()>;
157 fn to_record(&self) -> SequenceCollectionRecord;
158 fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()>;
159}
160
161impl SequenceCollectionExt for SequenceCollection {
162 fn from_fasta<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
164 Self::from_path_with_cache(file_path, true, true)
165 }
166
167 fn from_rgsi<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
168 let rgsi_file_path = file_path.as_ref().replace_exts_with("rgsi");
169
170 if rgsi_file_path.exists() {
171 read_rgsi_file(&rgsi_file_path)
172 } else {
173 Err(anyhow::anyhow!(
174 "RGSI file does not exist at {:?}",
175 rgsi_file_path
176 ))
177 }
178 }
179
180 fn from_path_no_cache<P: AsRef<Path>>(file_path: P) -> Result<SequenceCollection> {
182 Self::from_path_with_cache(file_path, false, false)
183 }
184
185 fn from_path_with_cache<P: AsRef<Path>>(
186 file_path: P,
187 read_cache: bool,
188 write_cache: bool,
189 ) -> Result<SequenceCollection> {
190 let fa_file_path = file_path.as_ref();
191 let rgsi_file_path = fa_file_path.replace_exts_with("rgsi");
192
193 if read_cache && rgsi_file_path.exists() {
195 let seqcol = read_rgsi_file(&rgsi_file_path)?;
196 if !seqcol.sequences.is_empty() {
197 return Ok(seqcol);
198 }
199 let _ = std::fs::remove_file(&rgsi_file_path);
201 }
202
203 let seqcol: SequenceCollection = digest_fasta(file_path.as_ref())?;
205
206 if write_cache && !rgsi_file_path.exists() {
208 seqcol.write_rgsi()?;
209 }
210 Ok(seqcol)
211 }
212
213 fn write_collection_rgsi<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
215 write_rgsi_impl(file_path, &self.metadata, Some(&self.sequences))
216 }
217
218 fn write_rgsi(&self) -> Result<()> {
220 if let Some(file_path) = &self.metadata.file_path {
221 let rgsi_file_path = file_path.replace_exts_with("rgsi");
222 self.write_collection_rgsi(rgsi_file_path)
223 } else {
224 Err(anyhow::anyhow!(
225 "No file path specified for RGSI output. Use `write_collection_rgsi` to specify a file path."
226 ))
227 }
228 }
229
230 fn to_record(&self) -> SequenceCollectionRecord {
232 SequenceCollectionRecord::from(self.clone())
233 }
234
235 fn write_fasta<P: AsRef<Path>>(&self, file_path: P, line_width: Option<usize>) -> Result<()> {
237 use std::fs::File;
238
239 let line_width = line_width.unwrap_or(70);
240 let mut output_file = File::create(file_path)?;
241
242 for record in &self.sequences {
243 if !record.is_loaded() {
244 return Err(anyhow::anyhow!(
245 "Cannot write FASTA: sequence '{}' does not have data loaded",
246 record.metadata().name
247 ));
248 }
249
250 let decoded_sequence = record.decode().ok_or_else(|| {
251 anyhow::anyhow!("Failed to decode sequence '{}'", record.metadata().name)
252 })?;
253
254 let metadata = record.metadata();
256 let header = match &metadata.description {
257 Some(desc) => format!(">{} {}", metadata.name, desc),
258 None => format!(">{}", metadata.name),
259 };
260 writeln!(output_file, "{}", header)?;
261
262 for chunk in decoded_sequence.as_bytes().chunks(line_width) {
264 output_file.write_all(chunk)?;
265 output_file.write_all(b"\n")?;
266 }
267 }
268
269 Ok(())
270 }
271}
272
273pub fn read_rgsi_file<T: AsRef<Path>>(file_path: T) -> Result<SequenceCollection> {
279 use std::io::BufRead;
280
281 let file = std::fs::File::open(&file_path)?;
282 let reader = std::io::BufReader::new(file);
283 let mut results = Vec::new();
284
285 let mut seqcol_digest = String::new();
287 let mut names_digest = String::new();
288 let mut sequences_digest = String::new();
289 let mut lengths_digest = String::new();
290 let mut name_length_pairs_digest: Option<String> = None;
291 let mut sorted_name_length_pairs_digest: Option<String> = None;
292 let mut sorted_sequences_digest: Option<String> = None;
293 for line in reader.lines() {
294 let line = line?;
295
296 if line.starts_with("##") {
298 if let Some(stripped) = line.strip_prefix("##") {
299 if let Some((key, value)) = stripped.split_once('=') {
300 match key {
301 "seqcol_digest" => seqcol_digest = value.to_string(),
302 "names_digest" => names_digest = value.to_string(),
303 "sequences_digest" => sequences_digest = value.to_string(),
304 "lengths_digest" => lengths_digest = value.to_string(),
305 "name_length_pairs_digest" => name_length_pairs_digest = Some(value.to_string()),
306 "sorted_name_length_pairs_digest" => sorted_name_length_pairs_digest = Some(value.to_string()),
307 "sorted_sequences_digest" => sorted_sequences_digest = Some(value.to_string()),
308 _ => {} }
310 }
311 }
312 continue;
313 }
314
315 if line.starts_with('#') {
317 continue;
318 }
319
320 if let Some(metadata) = parse_rgsi_line(&line) {
322 results.push(SequenceRecord::Stub(metadata));
323 }
324 }
325
326 let collection_metadata = if sequences_digest.is_empty()
328 || names_digest.is_empty()
329 || lengths_digest.is_empty()
330 {
331 SequenceCollectionMetadata::from_sequences(&results, Some(file_path.as_ref().to_path_buf()))
333 } else {
334 let lvl1 = SeqColDigestLvl1 {
336 sequences_digest,
337 names_digest,
338 lengths_digest,
339 };
340
341 let digest = if seqcol_digest.is_empty() {
342 lvl1.to_digest()
343 } else {
344 seqcol_digest
345 };
346
347 SequenceCollectionMetadata {
348 digest,
349 n_sequences: results.len(),
350 names_digest: lvl1.names_digest,
351 sequences_digest: lvl1.sequences_digest,
352 lengths_digest: lvl1.lengths_digest,
353 name_length_pairs_digest,
354 sorted_name_length_pairs_digest,
355 sorted_sequences_digest,
356 file_path: Some(file_path.as_ref().to_path_buf()),
357 }
358 };
359
360 Ok(SequenceCollection {
361 metadata: collection_metadata,
362 sequences: results,
363 })
364}
365
366#[cfg(test)]
371mod tests {
372 use super::*;
373 use crate::digest::encode_sequence;
374 use crate::digest::{AlphabetType, lookup_alphabet};
375 use crate::fasta::{digest_fasta, load_fasta};
376
377 #[test]
378 fn test_decode_returns_none_when_no_data() {
379 let seqcol =
380 digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
381
382 for seq_record in &seqcol.sequences {
383 assert!(!seq_record.is_loaded());
384 assert_eq!(seq_record.decode(), None);
385 }
386 }
387
388 #[test]
389 fn test_decode_with_loaded_data() {
390 let seqcol =
391 load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
392
393 let expected_sequences = vec![("chrX", "TTGGGGAA"), ("chr1", "GGAA"), ("chr2", "GCGC")];
394
395 for (seq_record, (expected_name, expected_seq)) in
396 seqcol.sequences.iter().zip(expected_sequences.iter())
397 {
398 assert_eq!(seq_record.metadata().name, *expected_name);
399 assert!(seq_record.is_loaded());
400 let decoded = seq_record.decode().expect("decode() should return Some");
401 assert_eq!(decoded, *expected_seq);
402 }
403 }
404
405 #[test]
406 fn test_decode_handles_encoded_data() {
407 let sequence = b"ACGT";
408 let alphabet = lookup_alphabet(&AlphabetType::Dna2bit);
409 let encoded_data = encode_sequence(sequence, alphabet);
410
411 let record = SequenceRecord::Full {
412 metadata: SequenceMetadata {
413 name: "test_seq".to_string(),
414 description: None,
415 length: 4,
416 sha512t24u: "test_digest".to_string(),
417 md5: "test_md5".to_string(),
418 alphabet: AlphabetType::Dna2bit,
419 fai: None,
420 },
421 sequence: encoded_data,
422 };
423
424 let decoded = record.decode().expect("Should decode encoded data");
425 assert_eq!(decoded, "ACGT");
426 }
427
428 #[test]
429 fn test_sequence_collection_from_fasta() {
430 let seqcol = SequenceCollection::from_path_no_cache("../tests/data/fasta/base.fa")
431 .expect("Failed to load FASTA");
432
433 assert_eq!(seqcol.sequences.len(), 3);
434 assert!(!seqcol.metadata.digest.is_empty());
435 }
436
437 #[test]
438 fn test_sequence_collection_write_and_read_rgsi() {
439 use tempfile::tempdir;
440
441 let seqcol =
442 digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
443
444 let dir = tempdir().expect("Failed to create temp dir");
445 let rgsi_path = dir.path().join("test.rgsi");
446
447 seqcol
448 .write_collection_rgsi(&rgsi_path)
449 .expect("Failed to write RGSI");
450
451 let loaded = read_rgsi_file(&rgsi_path).expect("Failed to read RGSI");
452
453 assert_eq!(loaded.metadata.digest, seqcol.metadata.digest);
454 assert_eq!(loaded.sequences.len(), seqcol.sequences.len());
455 }
456
457 #[test]
458 fn test_sequence_collection_write_fasta() {
459 use std::fs;
460 use tempfile::tempdir;
461
462 let seqcol =
463 load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
464
465 let dir = tempdir().expect("Failed to create temp dir");
466 let fasta_path = dir.path().join("output.fa");
467
468 seqcol
469 .write_fasta(&fasta_path, None)
470 .expect("Failed to write FASTA");
471
472 let content = fs::read_to_string(&fasta_path).expect("Failed to read file");
473 assert!(content.contains(">chrX"));
474 assert!(content.contains("TTGGGGAA"));
475 }
476
477 #[test]
478 fn test_digest_sequence_basic() {
479 let seq = digest_sequence("test_seq", b"ACGTACGT");
480 assert_eq!(seq.metadata().name, "test_seq");
481 assert_eq!(seq.metadata().length, 8);
482 assert!(!seq.metadata().sha512t24u.is_empty());
483 assert!(seq.is_loaded());
484 assert_eq!(seq.sequence().unwrap(), b"ACGTACGT");
485 }
486
487 #[test]
488 fn test_digest_sequence_matches_fasta_digest() {
489 let seqcol =
490 load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
491 let fasta_seq = &seqcol.sequences[0]; let prog_seq = digest_sequence("chrX", b"TTGGGGAA");
494
495 assert_eq!(
496 prog_seq.metadata().sha512t24u,
497 fasta_seq.metadata().sha512t24u
498 );
499 assert_eq!(prog_seq.metadata().md5, fasta_seq.metadata().md5);
500 }
501
502 #[test]
503 fn test_sequence_metadata_disk_size() {
504 use crate::store::StorageMode;
505
506 let metadata = SequenceMetadata {
507 name: "test".to_string(),
508 description: None,
509 length: 1000,
510 sha512t24u: "test".to_string(),
511 md5: "test".to_string(),
512 alphabet: AlphabetType::Dna2bit,
513 fai: None,
514 };
515
516 assert_eq!(metadata.disk_size(&StorageMode::Raw), 1000);
517 assert_eq!(metadata.disk_size(&StorageMode::Encoded), 250);
518 }
519
520 #[test]
521 fn test_sequence_record_to_file() {
522 use std::fs;
523 use tempfile::tempdir;
524
525 let seqcol =
526 load_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
527
528 let dir = tempdir().expect("Failed to create temp dir");
529 let file_path = dir.path().join("test_seq.txt");
530
531 seqcol.sequences[0]
532 .to_file(&file_path)
533 .expect("Failed to write file");
534
535 let content = fs::read(&file_path).expect("Failed to read file");
536 assert!(!content.is_empty());
537 }
538
539 #[test]
540 fn test_rgsi_round_trip_ancillary_digests() {
541 use tempfile::tempdir;
542
543 let mut seqcol =
544 digest_fasta("../tests/data/fasta/base.fa").expect("Failed to load test FASTA file");
545
546 seqcol
548 .metadata
549 .compute_ancillary_digests(&seqcol.sequences);
550 assert!(seqcol.metadata.name_length_pairs_digest.is_some());
551 assert!(seqcol.metadata.sorted_name_length_pairs_digest.is_some());
552 assert!(seqcol.metadata.sorted_sequences_digest.is_some());
553
554 let dir = tempdir().expect("Failed to create temp dir");
556 let rgsi_path = dir.path().join("test_ancillary.rgsi");
557 seqcol
558 .write_collection_rgsi(&rgsi_path)
559 .expect("Failed to write RGSI");
560
561 let loaded = read_rgsi_file(&rgsi_path).expect("Failed to read RGSI");
563
564 assert_eq!(
566 loaded.metadata.name_length_pairs_digest,
567 seqcol.metadata.name_length_pairs_digest
568 );
569 assert_eq!(
570 loaded.metadata.sorted_name_length_pairs_digest,
571 seqcol.metadata.sorted_name_length_pairs_digest
572 );
573 assert_eq!(
574 loaded.metadata.sorted_sequences_digest,
575 seqcol.metadata.sorted_sequences_digest
576 );
577 }
578}