1use std::collections::HashMap;
4use std::io::BufReader;
5use std::path::Path;
6
7use md5::{Digest, Md5};
8use noodles_fasta as fasta;
9use noodles_fasta::fai;
10use refget_digest::sha512t24u;
11use refget_model::{Alias, SequenceMetadata};
12use serde::{Deserialize, Serialize};
13
14use refget_model::SeqCol;
15use serde::de::DeserializeOwned;
16
17use crate::{SequenceStore, StoreError, StoreResult, extract_subsequence};
18
19pub trait SidecarCache: Serialize + DeserializeOwned + Sized {
24 const EXTENSION: &str;
26
27 fn cache_path_for<P: AsRef<Path>>(fasta_path: P) -> std::path::PathBuf {
29 let p = fasta_path.as_ref();
30 let ext = p
31 .extension()
32 .map(|e| format!("{}.{}", e.to_string_lossy(), Self::EXTENSION))
33 .unwrap_or_else(|| Self::EXTENSION.to_string());
34 p.with_extension(ext)
35 }
36
37 fn write<P: AsRef<Path>>(&self, fasta_path: P) -> StoreResult<std::path::PathBuf> {
39 let cache_path = Self::cache_path_for(&fasta_path);
40 let json = serde_json::to_string_pretty(self)
41 .map_err(|e| StoreError::Fasta(format!("Failed to serialize cache: {e}")))?;
42 std::fs::write(&cache_path, json)?;
43 Ok(cache_path)
44 }
45
46 fn load_if_fresh<P: AsRef<Path>>(fasta_path: P) -> Option<Self> {
49 let fasta_path = fasta_path.as_ref();
50 let cache_path = Self::cache_path_for(fasta_path);
51 let cache_meta = std::fs::metadata(&cache_path).ok()?;
52 let cache_mtime = cache_meta.modified().ok()?;
53
54 let fasta_mtime = std::fs::metadata(fasta_path).ok()?.modified().ok()?;
55 if cache_mtime < fasta_mtime {
56 return None;
57 }
58 let fai_path = fai_path_for(fasta_path);
59 if let Ok(fai_meta) = std::fs::metadata(&fai_path)
60 && let Ok(fai_mtime) = fai_meta.modified()
61 && cache_mtime < fai_mtime
62 {
63 return None;
64 }
65
66 let data = std::fs::read_to_string(&cache_path).ok()?;
67 serde_json::from_str(&data).ok()
68 }
69}
70
71pub struct FastaSequenceStore {
76 digest_index: HashMap<String, usize>,
78 records: Vec<RecordData>,
80}
81
82struct RecordData {
83 _name: String,
84 sequence: Vec<u8>,
85 metadata: SequenceMetadata,
86}
87
88#[derive(Debug, Clone)]
91pub struct FastaSequenceSummary {
92 pub name: String,
93 pub length: u64,
94 pub md5: String,
95 pub sha512t24u: String,
96 pub circular: bool,
97}
98
99#[derive(Debug, Clone, Serialize, Deserialize)]
104pub struct DigestCache {
105 pub sequences: Vec<CachedDigest>,
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct CachedDigest {
111 pub name: String,
112 pub length: u64,
113 pub md5: String,
114 pub sha512t24u: String,
116 #[serde(default, skip_serializing_if = "std::ops::Not::not")]
118 pub circular: bool,
119}
120
121impl CachedDigest {
122 pub fn to_metadata(&self) -> SequenceMetadata {
124 SequenceMetadata {
125 md5: self.md5.clone(),
126 sha512t24u: self.sha512t24u.clone(),
127 length: self.length,
128 aliases: vec![Alias {
129 naming_authority: "refseq".to_string(),
130 value: self.name.clone(),
131 }],
132 circular: self.circular,
133 }
134 }
135
136 pub fn to_summary(&self) -> FastaSequenceSummary {
138 FastaSequenceSummary {
139 name: self.name.clone(),
140 length: self.length,
141 md5: self.md5.clone(),
142 sha512t24u: self.sha512t24u.clone(),
143 circular: self.circular,
144 }
145 }
146}
147
148impl SidecarCache for DigestCache {
149 const EXTENSION: &str = "refget.json";
150}
151
152impl DigestCache {
153 pub fn from_fasta<P: AsRef<Path>>(path: P) -> StoreResult<Self> {
155 let path = path.as_ref();
156 let index = read_fai_index(path)?;
157 let index_records: &[fai::Record] = index.as_ref();
158
159 let file = std::fs::File::open(path)?;
160 let mut reader = fasta::io::Reader::new(BufReader::new(file));
161 let mut sequences = Vec::new();
162
163 for (idx, result) in reader.records().enumerate() {
164 let record = result
165 .map_err(|e| StoreError::Fasta(format!("Failed to read FASTA record: {e}")))?;
166
167 let name = std::str::from_utf8(record.name())
168 .map_err(|e| StoreError::Fasta(format!("Invalid UTF-8 in sequence name: {e}")))?
169 .to_string();
170 let seq_bytes: Vec<u8> = record
171 .sequence()
172 .as_ref()
173 .iter()
174 .copied()
175 .filter(|b| !b.is_ascii_whitespace())
176 .map(|b| b.to_ascii_uppercase())
177 .collect();
178
179 let length = seq_bytes.len() as u64;
180
181 if let Some(fai_record) = index_records.get(idx) {
182 let expected_length = fai_record.length() as usize;
183 if expected_length != seq_bytes.len() {
184 return Err(StoreError::Fasta(format!(
185 "Sequence length mismatch for {name}: index says {expected_length}, got {}",
186 seq_bytes.len()
187 )));
188 }
189 }
190
191 let md5 = format!("{:x}", Md5::digest(&seq_bytes));
192 let sha512t24u = format!("SQ.{}", sha512t24u(&seq_bytes));
193
194 sequences.push(CachedDigest { name, length, md5, sha512t24u, circular: false });
195 }
196
197 Ok(Self { sequences })
198 }
199}
200
201#[derive(Debug, Clone, Serialize, Deserialize)]
206pub struct SeqColCache {
207 pub collection: SeqCol,
208}
209
210impl SidecarCache for SeqColCache {
211 const EXTENSION: &str = "seqcol.json";
212}
213
214impl SeqColCache {
215 pub fn from_summaries(summaries: &[FastaSequenceSummary]) -> Self {
217 let collection = SeqCol {
218 names: summaries.iter().map(|s| s.name.clone()).collect(),
219 lengths: summaries.iter().map(|s| s.length).collect(),
220 sequences: summaries.iter().map(|s| s.sha512t24u.clone()).collect(),
221 sorted_name_length_pairs: None,
222 };
223 Self { collection }
224 }
225}
226
227pub(crate) fn fai_path_for(path: &Path) -> std::path::PathBuf {
229 path.with_extension(
230 path.extension()
231 .map(|e| format!("{}.fai", e.to_string_lossy()))
232 .unwrap_or_else(|| "fai".to_string()),
233 )
234}
235
236pub(crate) fn read_fai_index(path: &Path) -> StoreResult<fai::Index> {
238 let fai_path = fai_path_for(path);
239 if !fai_path.exists() {
240 return Err(StoreError::Fasta(format!("FASTA index not found: {}", fai_path.display())));
241 }
242 let fai_reader = BufReader::new(std::fs::File::open(&fai_path)?);
243 fai::Reader::new(fai_reader)
244 .read_index()
245 .map_err(|e| StoreError::Fasta(format!("Failed to read FASTA index: {e}")))
246}
247
248pub(crate) fn index_digests(
250 digest_index: &mut HashMap<String, usize>,
251 cached: &CachedDigest,
252 record_idx: usize,
253) {
254 digest_index.insert(cached.md5.clone(), record_idx);
255 digest_index.insert(cached.sha512t24u.clone(), record_idx);
256 let bare_sha = cached.sha512t24u.strip_prefix("SQ.").unwrap_or(&cached.sha512t24u).to_string();
257 digest_index.insert(bare_sha, record_idx);
258}
259
260impl FastaSequenceStore {
261 pub fn new() -> Self {
263 Self { digest_index: HashMap::new(), records: Vec::new() }
264 }
265
266 pub fn add_fasta<P: AsRef<Path>>(&mut self, path: P) -> StoreResult<Vec<FastaSequenceSummary>> {
271 let path = path.as_ref();
272 let cache = DigestCache::load_if_fresh(path);
273 let computed;
274 let cache = match &cache {
275 Some(c) => c,
276 None => {
277 computed = DigestCache::from_fasta(path)?;
278 &computed
279 }
280 };
281 self.add_fasta_with_cache(path, cache)
282 }
283
284 fn add_fasta_with_cache(
286 &mut self,
287 path: &Path,
288 cache: &DigestCache,
289 ) -> StoreResult<Vec<FastaSequenceSummary>> {
290 let file = std::fs::File::open(path)?;
291 let mut reader = fasta::io::Reader::new(BufReader::new(file));
292 let mut summaries = Vec::new();
293
294 for (idx, result) in reader.records().enumerate() {
295 let record = result
296 .map_err(|e| StoreError::Fasta(format!("Failed to read FASTA record: {e}")))?;
297
298 let seq_bytes: Vec<u8> = record
299 .sequence()
300 .as_ref()
301 .iter()
302 .copied()
303 .filter(|b| !b.is_ascii_whitespace())
304 .map(|b| b.to_ascii_uppercase())
305 .collect();
306
307 let cached = cache.sequences.get(idx).ok_or_else(|| {
308 StoreError::Fasta(format!(
309 "Digest cache has {} entries but FASTA has more sequences",
310 cache.sequences.len()
311 ))
312 })?;
313
314 if cached.length != seq_bytes.len() as u64 {
315 return Err(StoreError::Fasta(format!(
316 "Cache length mismatch for {}: cache says {}, got {}",
317 cached.name,
318 cached.length,
319 seq_bytes.len()
320 )));
321 }
322
323 let metadata = cached.to_metadata();
324 summaries.push(cached.to_summary());
325
326 let record_idx = self.records.len();
327 index_digests(&mut self.digest_index, cached, record_idx);
328 self.records.push(RecordData {
329 _name: cached.name.clone(),
330 sequence: seq_bytes,
331 metadata,
332 });
333 }
334
335 Ok(summaries)
336 }
337
338 pub fn mark_circular(&mut self, circular_names: &[String]) {
340 for record in &mut self.records {
341 if circular_names.iter().any(|n| n == &record._name) {
342 record.metadata.circular = true;
343 }
344 }
345 }
346
347 pub fn from_fasta<P: AsRef<Path>>(path: P) -> StoreResult<(Self, Vec<FastaSequenceSummary>)> {
349 let mut store = Self::new();
350 let summaries = store.add_fasta(path)?;
351 Ok((store, summaries))
352 }
353}
354
355impl Default for FastaSequenceStore {
356 fn default() -> Self {
357 Self::new()
358 }
359}
360
361impl SequenceStore for FastaSequenceStore {
362 fn get_sequence(
363 &self,
364 digest: &str,
365 start: Option<u64>,
366 end: Option<u64>,
367 ) -> StoreResult<Option<Vec<u8>>> {
368 let Some(&record_idx) = self.digest_index.get(digest) else {
369 return Ok(None);
370 };
371 let seq = &self.records[record_idx].sequence;
372 Ok(Some(extract_subsequence(seq, start, end)))
373 }
374
375 fn get_metadata(&self, digest: &str) -> StoreResult<Option<SequenceMetadata>> {
376 Ok(self.digest_index.get(digest).map(|&idx| self.records[idx].metadata.clone()))
377 }
378
379 fn get_length(&self, digest: &str) -> StoreResult<Option<u64>> {
380 Ok(self.digest_index.get(digest).map(|&idx| self.records[idx].metadata.length))
381 }
382}
383
384#[cfg(test)]
385mod tests {
386 use super::*;
387 use std::io::Write;
388 use tempfile::TempDir;
389
390 fn write_test_fasta(dir: &TempDir) -> std::path::PathBuf {
391 let fasta_path = dir.path().join("test.fa");
392 let mut f = std::fs::File::create(&fasta_path).unwrap();
393 writeln!(f, ">seq1").unwrap();
394 writeln!(f, "ACGTACGT").unwrap();
395 writeln!(f, ">seq2").unwrap();
396 writeln!(f, "NNNN").unwrap();
397
398 let fai_path = dir.path().join("test.fa.fai");
399 let mut fai = std::fs::File::create(&fai_path).unwrap();
400 writeln!(fai, "seq1\t8\t6\t8\t9").unwrap();
401 writeln!(fai, "seq2\t4\t21\t4\t5").unwrap();
402
403 fasta_path
404 }
405
406 #[test]
407 fn test_load_fasta() {
408 let dir = TempDir::new().unwrap();
409 let fasta_path = write_test_fasta(&dir);
410
411 let (store, summaries) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
412 assert_eq!(summaries.len(), 2);
413 assert_eq!(summaries[0].name, "seq1");
414 assert_eq!(summaries[0].length, 8);
415 assert_eq!(summaries[1].name, "seq2");
416 assert_eq!(summaries[1].length, 4);
417
418 let seq = store.get_sequence(&summaries[0].sha512t24u, None, None).unwrap().unwrap();
419 assert_eq!(seq, b"ACGTACGT");
420
421 let seq = store.get_sequence(&summaries[0].md5, None, None).unwrap().unwrap();
422 assert_eq!(seq, b"ACGTACGT");
423 }
424
425 #[test]
426 fn test_add_multiple_fastas() {
427 let dir = TempDir::new().unwrap();
428
429 let fa1 = dir.path().join("a.fa");
431 let mut f = std::fs::File::create(&fa1).unwrap();
432 writeln!(f, ">seq1\nACGT").unwrap();
433 let mut fai = std::fs::File::create(dir.path().join("a.fa.fai")).unwrap();
434 writeln!(fai, "seq1\t4\t6\t4\t5").unwrap();
435
436 let fa2 = dir.path().join("b.fa");
438 let mut f = std::fs::File::create(&fa2).unwrap();
439 writeln!(f, ">seq2\nTTTT").unwrap();
440 let mut fai = std::fs::File::create(dir.path().join("b.fa.fai")).unwrap();
441 writeln!(fai, "seq2\t4\t6\t4\t5").unwrap();
442
443 let mut store = FastaSequenceStore::new();
444 let s1 = store.add_fasta(&fa1).unwrap();
445 let s2 = store.add_fasta(&fa2).unwrap();
446
447 assert_eq!(s1.len(), 1);
448 assert_eq!(s2.len(), 1);
449
450 let seq1 = store.get_sequence(&s1[0].sha512t24u, None, None).unwrap().unwrap();
451 assert_eq!(seq1, b"ACGT");
452 let seq2 = store.get_sequence(&s2[0].sha512t24u, None, None).unwrap().unwrap();
453 assert_eq!(seq2, b"TTTT");
454 }
455
456 #[test]
457 fn test_subsequence() {
458 let dir = TempDir::new().unwrap();
459 let fasta_path = write_test_fasta(&dir);
460
461 let (store, summaries) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
462 let seq = store.get_sequence(&summaries[0].sha512t24u, Some(2), Some(5)).unwrap().unwrap();
463 assert_eq!(seq, b"GTA");
464 }
465
466 #[test]
467 fn test_metadata() {
468 let dir = TempDir::new().unwrap();
469 let fasta_path = write_test_fasta(&dir);
470
471 let (store, summaries) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
472 let meta = store.get_metadata(&summaries[0].sha512t24u).unwrap().unwrap();
473 assert_eq!(meta.length, 8);
474 assert_eq!(meta.sha512t24u, summaries[0].sha512t24u);
475 assert!(meta.sha512t24u.starts_with("SQ."), "ga4gh digest must have SQ. prefix");
476 assert_eq!(meta.md5, summaries[0].md5);
477 }
478
479 #[test]
480 fn test_from_fasta_missing_fai_errors() {
481 let dir = TempDir::new().unwrap();
482 let fasta_path = dir.path().join("no_index.fa");
483 let mut f = std::fs::File::create(&fasta_path).unwrap();
484 writeln!(f, ">seq1").unwrap();
485 writeln!(f, "ACGT").unwrap();
486
487 let result = FastaSequenceStore::from_fasta(&fasta_path);
488 let err_msg = format!("{}", result.err().expect("Expected an error for missing .fai file"));
489 assert!(err_msg.contains("FASTA index not found"), "Unexpected error: {err_msg}");
490 }
491
492 #[test]
493 fn test_from_fasta_empty_fasta() {
494 let dir = TempDir::new().unwrap();
495 let fasta_path = dir.path().join("empty.fa");
496 std::fs::File::create(&fasta_path).unwrap();
497 std::fs::File::create(dir.path().join("empty.fa.fai")).unwrap();
498
499 let (store, summaries) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
500 assert!(summaries.is_empty());
501 assert!(store.get_sequence("anything", None, None).unwrap().is_none());
502 }
503
504 #[test]
505 fn test_get_sequence_non_existent_digest_returns_none() {
506 let dir = TempDir::new().unwrap();
507 let fasta_path = write_test_fasta(&dir);
508
509 let (store, _) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
510 assert!(store.get_sequence("no_such_digest", None, None).unwrap().is_none());
511 }
512
513 #[test]
514 fn test_get_sequence_start_beyond_length_returns_empty() {
515 let dir = TempDir::new().unwrap();
516 let fasta_path = write_test_fasta(&dir);
517
518 let (store, summaries) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
519 let seq = store.get_sequence(&summaries[0].sha512t24u, Some(100), None).unwrap().unwrap();
520 assert!(seq.is_empty());
521 }
522
523 #[test]
524 fn test_lowercase_sequences_are_uppercased() {
525 let dir = TempDir::new().unwrap();
526 let fasta_path = dir.path().join("lower.fa");
527 let mut f = std::fs::File::create(&fasta_path).unwrap();
528 writeln!(f, ">seq1").unwrap();
529 writeln!(f, "acgtACGT").unwrap();
530
531 let fai_path = dir.path().join("lower.fa.fai");
532 let mut fai = std::fs::File::create(&fai_path).unwrap();
533 writeln!(fai, "seq1\t8\t6\t8\t9").unwrap();
534
535 let (store, summaries) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
536 let seq = store.get_sequence(&summaries[0].sha512t24u, None, None).unwrap().unwrap();
537 assert_eq!(seq, b"ACGTACGT");
538 }
539
540 #[test]
541 fn test_digest_cache_write_and_load() {
542 let dir = TempDir::new().unwrap();
543 let fasta_path = write_test_fasta(&dir);
544
545 let cache = DigestCache::from_fasta(&fasta_path).unwrap();
546 assert_eq!(cache.sequences.len(), 2);
547 let cache_path = cache.write(&fasta_path).unwrap();
548 assert!(cache_path.exists());
549 assert!(cache_path.to_string_lossy().ends_with(".refget.json"));
550
551 let loaded = DigestCache::load_if_fresh(&fasta_path).unwrap();
552 assert_eq!(loaded.sequences.len(), 2);
553 assert_eq!(loaded.sequences[0].name, "seq1");
554 assert!(loaded.sequences[0].sha512t24u.starts_with("SQ."));
555 }
556
557 #[test]
558 fn test_from_fasta_with_cache_matches_without() {
559 let dir = TempDir::new().unwrap();
560 let fasta_path = write_test_fasta(&dir);
561
562 let (_, summaries_no_cache) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
563
564 let cache = DigestCache::from_fasta(&fasta_path).unwrap();
565 cache.write(&fasta_path).unwrap();
566 let (store, summaries_cached) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
567
568 assert_eq!(summaries_no_cache.len(), summaries_cached.len());
569 for (a, b) in summaries_no_cache.iter().zip(summaries_cached.iter()) {
570 assert_eq!(a.name, b.name);
571 assert_eq!(a.length, b.length);
572 assert_eq!(a.md5, b.md5);
573 assert_eq!(a.sha512t24u, b.sha512t24u);
574 }
575
576 let seq = store.get_sequence(&summaries_cached[0].sha512t24u, None, None).unwrap().unwrap();
577 assert_eq!(seq, b"ACGTACGT");
578 }
579
580 #[test]
581 fn test_stale_cache_is_ignored() {
582 let dir = TempDir::new().unwrap();
583 let fasta_path = write_test_fasta(&dir);
584
585 let cache = DigestCache::from_fasta(&fasta_path).unwrap();
586 cache.write(&fasta_path).unwrap();
587
588 std::thread::sleep(std::time::Duration::from_millis(50));
589 let mut f = std::fs::OpenOptions::new().append(true).open(&fasta_path).unwrap();
590 writeln!(f, ">seq3").unwrap();
591 writeln!(f, "TTTT").unwrap();
592
593 assert!(DigestCache::load_if_fresh(&fasta_path).is_none());
594 }
595
596 #[test]
597 fn test_seqcol_cache_write_and_load() {
598 let dir = TempDir::new().unwrap();
599 let fasta_path = write_test_fasta(&dir);
600
601 let (_, summaries) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
602 let cache = SeqColCache::from_summaries(&summaries);
603 assert_eq!(cache.collection.names.len(), 2);
604 assert_eq!(cache.collection.names[0], "seq1");
605 assert_eq!(cache.collection.lengths[0], 8);
606
607 let cache_path = cache.write(&fasta_path).unwrap();
608 assert!(cache_path.exists());
609 assert!(cache_path.to_string_lossy().ends_with(".seqcol.json"));
610
611 let loaded = SeqColCache::load_if_fresh(&fasta_path).unwrap();
612 assert_eq!(loaded.collection.names, cache.collection.names);
613 assert_eq!(loaded.collection.lengths, cache.collection.lengths);
614 assert_eq!(loaded.collection.sequences, cache.collection.sequences);
615 assert_eq!(loaded.collection.digest(), cache.collection.digest());
616 }
617
618 #[test]
619 fn test_seqcol_cache_stale_is_ignored() {
620 let dir = TempDir::new().unwrap();
621 let fasta_path = write_test_fasta(&dir);
622
623 let (_, summaries) = FastaSequenceStore::from_fasta(&fasta_path).unwrap();
624 let cache = SeqColCache::from_summaries(&summaries);
625 cache.write(&fasta_path).unwrap();
626
627 std::thread::sleep(std::time::Duration::from_millis(50));
628 let mut f = std::fs::OpenOptions::new().append(true).open(&fasta_path).unwrap();
629 writeln!(f, ">seq3").unwrap();
630 writeln!(f, "TTTT").unwrap();
631
632 assert!(SeqColCache::load_if_fresh(&fasta_path).is_none());
633 }
634
635 #[test]
636 fn test_seqcol_cache_path_for() {
637 assert_eq!(
638 SeqColCache::cache_path_for("test.fa"),
639 std::path::PathBuf::from("test.fa.seqcol.json")
640 );
641 assert_eq!(
642 SeqColCache::cache_path_for("dir/genome.fasta"),
643 std::path::PathBuf::from("dir/genome.fasta.seqcol.json")
644 );
645 }
646}