Skip to main content

gtars_refget/
seqcol.rs

1//! Seqcol spec operations: comparison, level-based retrieval, attribute search.
2//!
3//! These methods are defined on `RefgetStore` but live in this file for
4//! organizational clarity. They implement the GA4GH Sequence Collections
5//! specification endpoints (level 1/2 retrieval, comparison, attribute search).
6
7use crate::digest::{CollectionLevel1, CollectionLevel2, SeqColComparison, SequenceCollectionMetadata};
8use crate::digest::types::{compare_arrays, level2_to_comparison_arrays};
9use crate::hashkeyable::HashKeyable;
10use crate::store::{PagedResult, ReadonlyRefgetStore};
11use anyhow::{anyhow, Result};
12
/// Trait defining the read-only operations a seqcol API server needs from its backend.
///
/// This is the canonical interface for seqcol server operations. Any backend
/// (filesystem-backed `ReadonlyRefgetStore`, Postgres, remote proxy, in-memory mock)
/// can implement this trait and be used via `Arc<dyn SeqColService>`.
///
/// All methods take `&self` and return concrete types, making the trait object-safe.
/// Every method except [`SeqColService::collection_count`] is fallible and reports
/// failures through `anyhow::Result`; which conditions count as errors is up to the
/// implementing backend.
pub trait SeqColService {
    /// Level 1: attribute digests only.
    ///
    /// `digest` identifies the collection to look up.
    fn get_collection_level1(&self, digest: &str) -> Result<CollectionLevel1>;

    /// Level 2: full arrays with values.
    ///
    /// `digest` identifies the collection to look up.
    fn get_collection_level2(&self, digest: &str) -> Result<CollectionLevel2>;

    /// Compare two collections by digest.
    ///
    /// `digest_a` and `digest_b` name the "a" and "b" sides of the comparison.
    fn compare(&self, digest_a: &str, digest_b: &str) -> Result<SeqColComparison>;

    /// Compare a stored collection against an externally-provided level-2 body.
    ///
    /// `digest_a` identifies the stored collection; `external` is the caller-supplied
    /// level-2 representation it is compared against.
    fn compare_with_level2(
        &self,
        digest_a: &str,
        external: &CollectionLevel2,
    ) -> Result<SeqColComparison>;

    /// Find collections sharing an attribute digest.
    ///
    /// `attr_name` is the attribute to match on and `attr_digest` the digest value
    /// to look for. The returned strings are presumably collection digests, as in
    /// the `ReadonlyRefgetStore` implementation — confirm for other backends.
    fn find_collections_by_attribute(
        &self,
        attr_name: &str,
        attr_digest: &str,
    ) -> Result<Vec<String>>;

    /// Get the raw attribute array for a given attribute digest.
    ///
    /// The `Option` in the return type lets a backend distinguish "nothing found"
    /// (`Ok(None)`) from a failure (`Err`).
    fn get_attribute(
        &self,
        attr_name: &str,
        attr_digest: &str,
    ) -> Result<Option<serde_json::Value>>;

    /// List collections with pagination and optional attribute filters.
    ///
    /// `page` and `page_size` select the window to return. `filters` is a slice of
    /// string pairs; presumably each pair is (attribute name, attribute digest) —
    /// TODO confirm against the backend's `list_collections`.
    fn list_collections(
        &self,
        page: usize,
        page_size: usize,
        filters: &[(&str, &str)],
    ) -> Result<PagedResult<SequenceCollectionMetadata>>;

    /// Total number of collections in the store.
    fn collection_count(&self) -> usize;
}
62
63/// Check if a collection's metadata matches a single attribute filter.
64/// Returns Err if attr_name is unrecognized.
65pub(crate) fn metadata_matches_attribute(
66    meta: &SequenceCollectionMetadata,
67    attr_name: &str,
68    attr_digest: &str,
69) -> Result<bool> {
70    match attr_name {
71        "names" => Ok(meta.names_digest == attr_digest),
72        "lengths" => Ok(meta.lengths_digest == attr_digest),
73        "sequences" => Ok(meta.sequences_digest == attr_digest),
74        "name_length_pairs" => Ok(meta
75            .name_length_pairs_digest
76            .as_deref()
77            .map_or(false, |d| d == attr_digest)),
78        "sorted_name_length_pairs" => Ok(meta
79            .sorted_name_length_pairs_digest
80            .as_deref()
81            .map_or(false, |d| d == attr_digest)),
82        "sorted_sequences" => Ok(meta
83            .sorted_sequences_digest
84            .as_deref()
85            .map_or(false, |d| d == attr_digest)),
86        _ => Err(anyhow!(
87            "Unknown attribute: '{}'. Supported: names, lengths, sequences, \
88             name_length_pairs, sorted_name_length_pairs, sorted_sequences",
89            attr_name
90        )),
91    }
92}
93
/// Warn users when brute-force scanning more than this many collections.
///
/// The comparison in the scan is strict (`count > threshold`), so a store holding
/// exactly this many collections does not trigger the warning.
const ATTRIBUTE_SEARCH_WARN_THRESHOLD: usize = 10_000;

/// Error if brute-force scanning would exceed this many collections.
///
/// The comparison in the scan is strict (`count > threshold`), so a store holding
/// exactly this many collections is still scanned.
const ATTRIBUTE_SEARCH_ERROR_THRESHOLD: usize = 100_000;
99
100impl ReadonlyRefgetStore {
101    /// Enable computation and storage of ancillary digests (nlp, snlp, sorted_sequences).
102    pub fn enable_ancillary_digests(&mut self) {
103        self.ancillary_digests = true;
104    }
105
106    /// Disable computation and storage of ancillary digests.
107    pub fn disable_ancillary_digests(&mut self) {
108        self.ancillary_digests = false;
109    }
110
111    /// Returns whether ancillary digests are enabled.
112    pub fn has_ancillary_digests(&self) -> bool {
113        self.ancillary_digests
114    }
115
116    /// Returns whether the on-disk attribute index is enabled.
117    pub fn has_attribute_index(&self) -> bool {
118        self.attribute_index
119    }
120
121    /// Get collection at level 1 representation (attribute digests with spec field names).
122    /// This is a lightweight operation that only reads metadata, no loading needed.
123    pub fn get_collection_level1(&self, digest: &str) -> Result<CollectionLevel1> {
124        let key = digest.to_key();
125        let record = self
126            .collections
127            .get(&key)
128            .ok_or_else(|| anyhow!("Collection not found: {}", digest))?;
129        Ok(record.metadata().to_level1())
130    }
131
132    /// Get collection at level 2 representation (full arrays, spec format).
133    /// May need to load the collection from disk/remote.
134    pub fn get_collection_level2(&self, digest: &str) -> Result<CollectionLevel2> {
135        let collection = self.get_collection(digest)?;
136        Ok(collection.to_level2())
137    }
138
139    /// Compare two collections by digest. Both must be preloaded.
140    pub fn compare(&self, digest_a: &str, digest_b: &str) -> Result<SeqColComparison> {
141        let coll_a = self.get_collection(digest_a)?;
142        let coll_b = self.get_collection(digest_b)?;
143        Ok(coll_a.compare(&coll_b))
144    }
145
146    /// Compare a stored collection (by digest) against an externally-provided level-2 body.
147    ///
148    /// Used for the seqcol spec `POST /comparison/:digest1` endpoint where the client
149    /// submits a local collection as JSON rather than referencing a stored digest.
150    ///
151    /// The returned `SeqColComparison` has `digests.a` set to the stored collection's
152    /// digest and `digests.b` set to `None` because the external collection has no
153    /// server-side digest.
154    pub fn compare_with_level2(
155        &self,
156        digest_a: &str,
157        external: &CollectionLevel2,
158    ) -> Result<SeqColComparison> {
159        let coll_a = self.get_collection(digest_a)?;
160        let arrays_a = coll_a.to_comparison_arrays();
161        let arrays_b = level2_to_comparison_arrays(external);
162        Ok(compare_arrays(
163            arrays_a,
164            arrays_b,
165            coll_a.metadata.digest.clone(),
166            None,
167        ))
168    }
169
170    /// Find all collections with a specific attribute digest.
171    ///
172    /// Dispatches to indexed lookup (if attribute_index enabled) or
173    /// brute-force metadata scan (default).
174    ///
175    /// Supported attr_name values: "names", "lengths", "sequences",
176    /// "name_length_pairs", "sorted_name_length_pairs", "sorted_sequences"
177    pub fn find_collections_by_attribute(
178        &self,
179        attr_name: &str,
180        attr_digest: &str,
181    ) -> Result<Vec<String>> {
182        if self.attribute_index {
183            self.find_collections_by_attribute_indexed(attr_name, attr_digest)
184        } else {
185            self.find_collections_by_attribute_scan(attr_name, attr_digest)
186        }
187    }
188
189    /// Brute-force scan of collection metadata.
190    /// Warns at 10k collections, errors at 100k collections.
191    fn find_collections_by_attribute_scan(
192        &self,
193        attr_name: &str,
194        attr_digest: &str,
195    ) -> Result<Vec<String>> {
196        let count = self.collections.len();
197
198        if count > ATTRIBUTE_SEARCH_ERROR_THRESHOLD {
199            return Err(anyhow!(
200                "Brute-force attribute search is limited to {} collections ({} in store). \
201                 Indexed attribute lookup is planned for a future release.",
202                ATTRIBUTE_SEARCH_ERROR_THRESHOLD,
203                count
204            ));
205        }
206
207        if count > ATTRIBUTE_SEARCH_WARN_THRESHOLD {
208            eprintln!(
209                "Warning: brute-force attribute search scanning {} collections. \
210                 This may be slow.",
211                count
212            );
213        }
214
215        let mut results = Vec::new();
216        for record in self.collections.values() {
217            let meta = record.metadata();
218            if metadata_matches_attribute(meta, attr_name, attr_digest)? {
219                results.push(meta.digest.clone());
220            }
221        }
222        Ok(results)
223    }
224
225    /// Get the raw attribute array for a given attribute digest.
226    /// Finds a collection with this attribute (via search), loads it,
227    /// and extracts the array.
228    ///
229    /// Supported attr_name values: "names", "lengths", "sequences",
230    /// "name_length_pairs", "sorted_name_length_pairs", "sorted_sequences"
231    ///
232    /// Returns the array as a serde_json::Value (array of strings or numbers).
233    /// Returns Ok(None) if no collection has this attribute digest.
234    pub fn get_attribute(
235        &self,
236        attr_name: &str,
237        attr_digest: &str,
238    ) -> Result<Option<serde_json::Value>> {
239        let collections = self.find_collections_by_attribute(attr_name, attr_digest)?;
240        if collections.is_empty() {
241            return Ok(None);
242        }
243
244        // Load the first matching collection
245        let collection = self.get_collection(&collections[0])?;
246        let lvl2 = collection.to_level2();
247
248        let value = match attr_name {
249            "names" => serde_json::Value::Array(
250                lvl2.names
251                    .iter()
252                    .map(|s| serde_json::Value::String(s.clone()))
253                    .collect(),
254            ),
255            "lengths" => serde_json::Value::Array(
256                lvl2.lengths
257                    .iter()
258                    .map(|l| serde_json::Value::Number(serde_json::Number::from(*l)))
259                    .collect(),
260            ),
261            "sequences" => serde_json::Value::Array(
262                lvl2.sequences
263                    .iter()
264                    .map(|s| serde_json::Value::String(s.clone()))
265                    .collect(),
266            ),
267            "sorted_sequences" => serde_json::Value::Array(
268                collection
269                    .build_sorted_sequences()
270                    .into_iter()
271                    .map(serde_json::Value::String)
272                    .collect(),
273            ),
274            "name_length_pairs" => {
275                serde_json::Value::Array(collection.build_name_length_pairs())
276            }
277            "sorted_name_length_pairs" => {
278                serde_json::Value::Array(collection.build_sorted_name_length_pairs())
279            }
280            _ => {
281                return Err(anyhow!(
282                    "Unknown attribute: '{}'. Supported: names, lengths, sequences, \
283                     name_length_pairs, sorted_name_length_pairs, sorted_sequences",
284                    attr_name
285                ))
286            }
287        };
288
289        Ok(Some(value))
290    }
291
292    /// Enable indexed attribute lookup (not yet implemented).
293    ///
294    /// Note: The indexed lookup feature is planned for a future release.
295    /// Enabling this will cause `find_collections_by_attribute()` to return
296    /// a "not implemented" error until the feature is complete.
297    pub fn enable_attribute_index(&mut self) {
298        self.attribute_index = true;
299    }
300
301    /// Disable indexed attribute lookup, using brute-force scan instead.
302    pub fn disable_attribute_index(&mut self) {
303        self.attribute_index = false;
304    }
305
306    /// Indexed lookup from on-disk reverse index.
307    /// Stub: not yet implemented. Planned for a future release.
308    fn find_collections_by_attribute_indexed(
309        &self,
310        _attr_name: &str,
311        _attr_digest: &str,
312    ) -> Result<Vec<String>> {
313        Err(anyhow!(
314            "Indexed attribute lookup is not yet implemented. \
315             This feature is planned for a future release. \
316             For now, use the brute-force scan by keeping attribute_index disabled."
317        ))
318    }
319
320    /// Total number of collections in the store.
321    pub fn collection_count(&self) -> usize {
322        self.collections.len()
323    }
324}
325
326impl SeqColService for ReadonlyRefgetStore {
327    fn get_collection_level1(&self, digest: &str) -> Result<CollectionLevel1> {
328        ReadonlyRefgetStore::get_collection_level1(self, digest)
329    }
330
331    fn get_collection_level2(&self, digest: &str) -> Result<CollectionLevel2> {
332        ReadonlyRefgetStore::get_collection_level2(self, digest)
333    }
334
335    fn compare(&self, digest_a: &str, digest_b: &str) -> Result<SeqColComparison> {
336        ReadonlyRefgetStore::compare(self, digest_a, digest_b)
337    }
338
339    fn compare_with_level2(
340        &self,
341        digest_a: &str,
342        external: &CollectionLevel2,
343    ) -> Result<SeqColComparison> {
344        ReadonlyRefgetStore::compare_with_level2(self, digest_a, external)
345    }
346
347    fn find_collections_by_attribute(
348        &self,
349        attr_name: &str,
350        attr_digest: &str,
351    ) -> Result<Vec<String>> {
352        ReadonlyRefgetStore::find_collections_by_attribute(self, attr_name, attr_digest)
353    }
354
355    fn get_attribute(
356        &self,
357        attr_name: &str,
358        attr_digest: &str,
359    ) -> Result<Option<serde_json::Value>> {
360        ReadonlyRefgetStore::get_attribute(self, attr_name, attr_digest)
361    }
362
363    fn list_collections(
364        &self,
365        page: usize,
366        page_size: usize,
367        filters: &[(&str, &str)],
368    ) -> Result<PagedResult<SequenceCollectionMetadata>> {
369        ReadonlyRefgetStore::list_collections(self, page, page_size, filters)
370    }
371
372    fn collection_count(&self) -> usize {
373        ReadonlyRefgetStore::collection_count(self)
374    }
375}
376
377#[cfg(test)]
378mod tests {
379    use crate::store::{FastaImportOptions, ReadonlyRefgetStore, RefgetStore};
380    use std::path::PathBuf;
381
382    /// Copy a test FASTA to a temp directory to avoid writing RGSI cache files
383    /// into the test data directory.
384    fn copy_test_fasta(temp_dir: &std::path::Path, name: &str) -> PathBuf {
385        let src = format!("../tests/data/fasta/{}", name);
386        let dst = temp_dir.join(name);
387        std::fs::copy(&src, &dst)
388            .unwrap_or_else(|e| panic!("Failed to copy {} to tempdir: {}", src, e));
389        dst
390    }
391
392    #[test]
393    fn test_ancillary_digests_computed() {
394        let mut store = RefgetStore::in_memory();
395        assert!(store.has_ancillary_digests());
396
397        let (metadata, _) = store
398            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
399            .unwrap();
400
401        // Ancillary digests should be present
402        assert!(metadata.name_length_pairs_digest.is_some());
403        assert!(metadata.sorted_name_length_pairs_digest.is_some());
404        assert!(metadata.sorted_sequences_digest.is_some());
405
406        // The stored collection should also have them
407        let coll_meta = store.get_collection_metadata(&metadata.digest).unwrap();
408        assert!(coll_meta.name_length_pairs_digest.is_some());
409        assert!(coll_meta.sorted_name_length_pairs_digest.is_some());
410        assert!(coll_meta.sorted_sequences_digest.is_some());
411    }
412
413    #[test]
414    fn test_ancillary_digests_disabled() {
415        let mut store = RefgetStore::in_memory();
416        store.disable_ancillary_digests();
417        assert!(!store.has_ancillary_digests());
418
419        let (metadata, _) = store
420            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
421            .unwrap();
422
423        // Ancillary digests should NOT be present
424        assert!(metadata.name_length_pairs_digest.is_none());
425        assert!(metadata.sorted_name_length_pairs_digest.is_none());
426        assert!(metadata.sorted_sequences_digest.is_none());
427    }
428
429    #[test]
430    fn test_collection_level1() {
431        let mut store = RefgetStore::in_memory();
432        let (metadata, _) = store
433            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
434            .unwrap();
435
436        let lvl1 = store.get_collection_level1(&metadata.digest).unwrap();
437        assert_eq!(lvl1.names, metadata.names_digest);
438        assert_eq!(lvl1.lengths, metadata.lengths_digest);
439        assert_eq!(lvl1.sequences, metadata.sequences_digest);
440        assert!(lvl1.name_length_pairs.is_some());
441        assert!(lvl1.sorted_name_length_pairs.is_some());
442        assert!(lvl1.sorted_sequences.is_some());
443    }
444
445    #[test]
446    fn test_collection_level2() {
447        let mut store = RefgetStore::in_memory();
448        let (metadata, _) = store
449            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
450            .unwrap();
451
452        let lvl2 = store.get_collection_level2(&metadata.digest).unwrap();
453        assert_eq!(lvl2.names.len(), 3); // chrX, chr1, chr2
454        assert_eq!(lvl2.lengths.len(), 3);
455        assert_eq!(lvl2.sequences.len(), 3);
456
457        // Check that sequences have SQ. prefix
458        for seq in &lvl2.sequences {
459            assert!(seq.starts_with("SQ."), "Expected SQ. prefix, got: {}", seq);
460        }
461
462        // Check lengths match the FASTA data
463        assert!(lvl2.lengths.contains(&8)); // chrX = TTGGGGAA
464        assert!(lvl2.lengths.contains(&4)); // chr1 = GGAA, chr2 = GCGC
465    }
466
467    #[test]
468    fn test_compare_collections() {
469        let mut store = RefgetStore::in_memory();
470        let (meta_a, _) = store
471            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
472            .unwrap();
473        let (meta_b, _) = store
474            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
475            .unwrap();
476
477        // Self-compare: identical digests, all elements same order
478        let self_result = store.compare(&meta_a.digest, &meta_a.digest).unwrap();
479        assert_eq!(Some(self_result.digests.a.as_str()), self_result.digests.b.as_deref());
480        assert_eq!(self_result.attributes.a_and_b.len(), 6); // 3 core + 3 ancillary
481        for attr in &self_result.attributes.a_and_b {
482            assert_eq!(self_result.array_elements.a_and_b_same_order[attr], Some(true));
483        }
484
485        // Cross-compare: different digests, all 6 attributes shared
486        let cross_result = store.compare(&meta_a.digest, &meta_b.digest).unwrap();
487        assert_ne!(Some(cross_result.digests.a.as_str()), cross_result.digests.b.as_deref());
488        assert_eq!(cross_result.attributes.a_and_b.len(), 6);
489        assert!(cross_result.attributes.a_only.is_empty());
490        assert!(cross_result.attributes.b_only.is_empty());
491    }
492
493    #[test]
494    fn test_compare_mixed_ancillary() {
495        let mut store = RefgetStore::in_memory();
496        let (meta_a, _) = store
497            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
498            .unwrap();
499        store.disable_ancillary_digests();
500        let (meta_b, _) = store
501            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
502            .unwrap();
503
504        let result = store.compare(&meta_a.digest, &meta_b.digest).unwrap();
505        assert_eq!(result.attributes.a_and_b.len(), 3);
506        assert_eq!(result.attributes.a_only.len(), 3);
507        assert!(result.attributes.b_only.is_empty());
508    }
509
510    #[test]
511    fn test_find_collections_by_attribute() {
512        let mut store = RefgetStore::in_memory();
513        let (metadata, _) = store
514            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
515            .unwrap();
516
517        // Search by names digest should find our collection
518        let results = store
519            .find_collections_by_attribute("names", &metadata.names_digest)
520            .unwrap();
521        assert_eq!(results.len(), 1);
522        assert_eq!(results[0], metadata.digest);
523
524        // Search by lengths digest
525        let results = store
526            .find_collections_by_attribute("lengths", &metadata.lengths_digest)
527            .unwrap();
528        assert_eq!(results.len(), 1);
529
530        // Search by sequences digest
531        let results = store
532            .find_collections_by_attribute("sequences", &metadata.sequences_digest)
533            .unwrap();
534        assert_eq!(results.len(), 1);
535
536        // Search by ancillary digest
537        let nlp = metadata.name_length_pairs_digest.as_ref().unwrap();
538        let results = store
539            .find_collections_by_attribute("name_length_pairs", nlp)
540            .unwrap();
541        assert_eq!(results.len(), 1);
542
543        // Search with nonexistent digest
544        let results = store
545            .find_collections_by_attribute("names", "nonexistent")
546            .unwrap();
547        assert!(results.is_empty());
548
549        // Unknown attribute should error
550        assert!(store
551            .find_collections_by_attribute("unknown", "digest")
552            .is_err());
553    }
554
555    #[test]
556    fn test_get_attribute() {
557        let mut store = RefgetStore::in_memory();
558        let (metadata, _) = store
559            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
560            .unwrap();
561
562        // Get names array
563        let result = store
564            .get_attribute("names", &metadata.names_digest)
565            .unwrap();
566        assert!(result.is_some());
567        let names = result.unwrap();
568        assert!(names.is_array());
569        assert_eq!(names.as_array().unwrap().len(), 3);
570
571        // Get lengths array
572        let result = store
573            .get_attribute("lengths", &metadata.lengths_digest)
574            .unwrap();
575        assert!(result.is_some());
576
577        // Nonexistent digest returns None
578        let result = store.get_attribute("names", "nonexistent").unwrap();
579        assert!(result.is_none());
580    }
581
582    #[test]
583    fn test_get_attribute_sorted_sequences() {
584        let mut store = RefgetStore::in_memory();
585        let (metadata, _) = store
586            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
587            .unwrap();
588
589        let digest = metadata.sorted_sequences_digest.as_ref().unwrap();
590        let result = store.get_attribute("sorted_sequences", digest).unwrap();
591        assert!(result.is_some());
592        let arr = result.unwrap();
593        assert!(arr.is_array());
594        let items = arr.as_array().unwrap();
595        assert_eq!(items.len(), 3);
596
597        // All items should be strings with SQ. prefix
598        for item in items {
599            let s = item.as_str().unwrap();
600            assert!(s.starts_with("SQ."), "Expected SQ. prefix, got: {}", s);
601        }
602
603        // Should be sorted lexicographically
604        let strings: Vec<&str> = items.iter().map(|v| v.as_str().unwrap()).collect();
605        let mut sorted = strings.clone();
606        sorted.sort();
607        assert_eq!(strings, sorted, "sorted_sequences should be in sorted order");
608    }
609
610    #[test]
611    fn test_get_attribute_name_length_pairs() {
612        let mut store = RefgetStore::in_memory();
613        let (metadata, _) = store
614            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
615            .unwrap();
616
617        let digest = metadata.name_length_pairs_digest.as_ref().unwrap();
618        let result = store.get_attribute("name_length_pairs", digest).unwrap();
619        assert!(result.is_some());
620        let arr = result.unwrap();
621        assert!(arr.is_array());
622        let items = arr.as_array().unwrap();
623        assert_eq!(items.len(), 3);
624
625        // Each item should be an object with "name" (string) and "length" (number) keys
626        for item in items {
627            let obj = item.as_object().unwrap();
628            assert!(obj.contains_key("name"), "Expected 'name' key in object");
629            assert!(obj.contains_key("length"), "Expected 'length' key in object");
630            assert!(obj["name"].is_string(), "name should be a string");
631            assert!(obj["length"].is_number(), "length should be a number");
632        }
633    }
634
635    #[test]
636    fn test_get_attribute_sorted_name_length_pairs() {
637        use crate::digest::algorithms::{canonicalize_json, sha512t24u};
638
639        let mut store = RefgetStore::in_memory();
640        let (metadata, _) = store
641            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
642            .unwrap();
643
644        let digest = metadata.sorted_name_length_pairs_digest.as_ref().unwrap();
645        let result = store
646            .get_attribute("sorted_name_length_pairs", digest)
647            .unwrap();
648        assert!(result.is_some());
649        let arr = result.unwrap();
650        assert!(arr.is_array());
651        let items = arr.as_array().unwrap();
652        assert_eq!(items.len(), 3);
653
654        // Each item should be an object with "name" and "length" keys
655        for item in items {
656            let obj = item.as_object().unwrap();
657            assert!(obj.contains_key("name"));
658            assert!(obj.contains_key("length"));
659        }
660
661        // Verify the objects are sorted by their canonical JSON digest
662        let digests: Vec<String> = items
663            .iter()
664            .map(|v| sha512t24u(canonicalize_json(v).as_bytes()))
665            .collect();
666        let mut sorted_digests = digests.clone();
667        sorted_digests.sort();
668        assert_eq!(
669            digests, sorted_digests,
670            "sorted_name_length_pairs objects should be in sorted digest order"
671        );
672    }
673
674    #[test]
675    fn test_get_attribute_ancillary_not_computed() {
676        let mut store = RefgetStore::in_memory();
677        store.disable_ancillary_digests();
678        let (metadata, _) = store
679            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
680            .unwrap();
681
682        // Ancillary digests not present, so search returns empty, get_attribute returns None
683        assert!(metadata.name_length_pairs_digest.is_none());
684        let result = store
685            .get_attribute("name_length_pairs", "some_digest")
686            .unwrap();
687        assert!(
688            result.is_none(),
689            "Expected None when no ancillary digests are computed"
690        );
691    }
692
693    #[test]
694    fn test_rgci_roundtrip_with_ancillary() {
695        let dir = tempfile::tempdir().unwrap();
696        let dir_path = dir.path();
697        let temp_fasta = copy_test_fasta(dir_path, "base.fa");
698
699        // Create and save a store with ancillary digests
700        {
701            let mut store = RefgetStore::on_disk(dir_path).unwrap();
702            store
703                .add_sequence_collection_from_fasta(&temp_fasta, FastaImportOptions::new())
704                .unwrap();
705            store.write().unwrap();
706        }
707
708        // Reload and verify ancillary digests survived
709        {
710            let store = RefgetStore::open_local(dir_path).unwrap();
711            let collections = store.list_collections(0, usize::MAX, &[]).unwrap();
712            assert_eq!(collections.results.len(), 1);
713
714            let meta = &collections.results[0];
715            assert!(meta.name_length_pairs_digest.is_some());
716            assert!(meta.sorted_name_length_pairs_digest.is_some());
717            assert!(meta.sorted_sequences_digest.is_some());
718        }
719    }
720
721    // ================================================================
722    // Compliance tests against Python refget test_fasta_digests.json
723    // To add new test cases, edit tests/data/fasta/test_fasta_digests.json
724    // and add corresponding .fa files to tests/data/fasta/.
725    // ================================================================
726
727    #[test]
728    fn test_compliance_digests_from_fixture() {
729        let fixture_path = "../tests/data/fasta/test_fasta_digests.json";
730        let fixture_str = std::fs::read_to_string(fixture_path)
731            .unwrap_or_else(|e| panic!("Failed to read {}: {}", fixture_path, e));
732        let fixture: serde_json::Value = serde_json::from_str(&fixture_str)
733            .unwrap_or_else(|e| panic!("Failed to parse {}: {}", fixture_path, e));
734
735        let mut store = RefgetStore::in_memory();
736        store.enable_ancillary_digests();
737
738        for (fa_name, bundle) in fixture.as_object().unwrap() {
739            let fasta_path = format!("../tests/data/fasta/{}", fa_name);
740            let (meta, _) = store
741                .add_sequence_collection_from_fasta(&fasta_path, FastaImportOptions::new())
742                .unwrap_or_else(|e| panic!("{}: {}", fa_name, e));
743
744            let lvl1 = bundle["level1"].as_object().unwrap();
745            let expected_digest = bundle["top_level_digest"].as_str().unwrap();
746
747            assert_eq!(meta.digest, expected_digest, "{}: top_level_digest", fa_name);
748            assert_eq!(meta.names_digest, lvl1["names"].as_str().unwrap(), "{}: names", fa_name);
749            assert_eq!(meta.lengths_digest, lvl1["lengths"].as_str().unwrap(), "{}: lengths", fa_name);
750            assert_eq!(meta.sequences_digest, lvl1["sequences"].as_str().unwrap(), "{}: sequences", fa_name);
751            assert_eq!(
752                meta.sorted_sequences_digest.as_deref(),
753                Some(lvl1["sorted_sequences"].as_str().unwrap()),
754                "{}: sorted_sequences", fa_name
755            );
756            assert_eq!(
757                meta.name_length_pairs_digest.as_deref(),
758                Some(lvl1["name_length_pairs"].as_str().unwrap()),
759                "{}: name_length_pairs", fa_name
760            );
761            assert_eq!(
762                meta.sorted_name_length_pairs_digest.as_deref(),
763                Some(lvl1["sorted_name_length_pairs"].as_str().unwrap()),
764                "{}: sorted_name_length_pairs", fa_name
765            );
766        }
767    }
768
769    #[test]
770    fn test_store_config_persisted() {
771        let dir = tempfile::tempdir().unwrap();
772        let dir_path = dir.path();
773        let temp_fasta = copy_test_fasta(dir_path, "base.fa");
774
775        // Create store with ancillary enabled (default)
776        {
777            let mut store = RefgetStore::on_disk(dir_path).unwrap();
778            assert!(store.has_ancillary_digests());
779            assert!(!store.has_attribute_index());
780            store
781                .add_sequence_collection_from_fasta(&temp_fasta, FastaImportOptions::new())
782                .unwrap();
783            store.write().unwrap();
784        }
785
786        // Reload and verify config
787        {
788            let store = RefgetStore::open_local(dir_path).unwrap();
789            assert!(store.has_ancillary_digests());
790            assert!(!store.has_attribute_index());
791        }
792    }
793
794    // ================================================================
795    // Tests for compare_with_level2
796    // ================================================================
797
798    /// Test 1: compare_with_level2 produces same result as compare when inputs are identical.
799    #[test]
800    fn test_compare_with_level2_self_identical() {
801        let mut store = RefgetStore::in_memory();
802        let (meta, _) = store
803            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
804            .unwrap();
805
806        // Get the level-2 representation of the stored collection
807        let level2 = store.get_collection_level2(&meta.digest).unwrap();
808
809        // Compare the stored collection against its own level-2 body
810        let result = store.compare_with_level2(&meta.digest, &level2).unwrap();
811
812        // digests.a is the stored digest, digests.b is None (no server-side digest for external body)
813        assert_eq!(result.digests.a, meta.digest);
814        assert!(result.digests.b.is_none(), "digests.b should be None for external level-2 comparison");
815
816        // All three core attributes should be shared (a_and_b)
817        assert!(result.attributes.a_and_b.contains(&"names".to_string()));
818        assert!(result.attributes.a_and_b.contains(&"lengths".to_string()));
819        assert!(result.attributes.a_and_b.contains(&"sequences".to_string()));
820
821        // All shared core attributes should be in same order
822        for attr in &["names", "lengths", "sequences"] {
823            assert_eq!(
824                result.array_elements.a_and_b_same_order[*attr],
825                Some(true),
826                "{} should be in same order",
827                attr
828            );
829        }
830    }
831
832    /// Test 2: compare_with_level2 with a different external collection produces equivalent
833    /// result to compare(), except digests.b is None instead of Some(digest_b).
834    #[test]
835    fn test_compare_with_level2_cross_compare() {
836        let mut store = RefgetStore::in_memory();
837        let (meta_a, _) = store
838            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
839            .unwrap();
840        let (meta_b, _) = store
841            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
842            .unwrap();
843
844        // Get level-2 of B
845        let level2_b = store.get_collection_level2(&meta_b.digest).unwrap();
846
847        // compare via digest-digest
848        let compare_result = store.compare(&meta_a.digest, &meta_b.digest).unwrap();
849        // compare via digest + external level-2
850        let with_level2_result = store.compare_with_level2(&meta_a.digest, &level2_b).unwrap();
851
852        // digests.b is None in compare_with_level2 result, Some(digest_b) in compare result
853        assert_eq!(compare_result.digests.b, Some(meta_b.digest.clone()));
854        assert!(with_level2_result.digests.b.is_none());
855
856        // Everything else should match
857        assert_eq!(compare_result.digests.a, with_level2_result.digests.a);
858        // The a_and_b core attributes should agree (both compare the same data for core attributes)
859        for attr in &["names", "lengths", "sequences"] {
860            assert!(
861                with_level2_result.attributes.a_and_b.contains(&attr.to_string())
862                    || with_level2_result.attributes.a_only.contains(&attr.to_string())
863                    || with_level2_result.attributes.b_only.contains(&attr.to_string()),
864                "attr {} must appear somewhere",
865                attr
866            );
867        }
868    }
869
870    /// Test 3: compare_with_level2 with an unknown digest returns an error.
871    #[test]
872    fn test_compare_with_level2_unknown_digest_returns_error() {
873        let store = RefgetStore::in_memory();
874        // Build a minimal CollectionLevel2 body
875        use crate::digest::CollectionLevel2;
876        let level2 = CollectionLevel2 {
877            names: vec!["chr1".to_string()],
878            lengths: vec![100],
879            sequences: vec!["SQ.aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".to_string()],
880        };
881
882        let result = store.compare_with_level2("nonexistent_digest", &level2);
883        assert!(result.is_err(), "Expected error for unknown digest");
884    }
885
886    /// Test 4: ancillary attributes from the stored collection appear in a_only when
887    /// the external level-2 body does not contain them.
888    #[test]
889    fn test_compare_with_level2_ancillary_in_a_only() {
890        let mut store = RefgetStore::in_memory();
891        // Import with ancillary digests enabled (default)
892        let (meta, _) = store
893            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
894            .unwrap();
895
896        // Verify ancillary digests were computed
897        assert!(meta.name_length_pairs_digest.is_some());
898        assert!(meta.sorted_name_length_pairs_digest.is_some());
899        assert!(meta.sorted_sequences_digest.is_some());
900
901        // The level-2 body has only the three core attributes — no ancillary
902        let level2 = store.get_collection_level2(&meta.digest).unwrap();
903
904        let result = store.compare_with_level2(&meta.digest, &level2).unwrap();
905
906        // Ancillary attributes that are in stored collection but not in level-2 body
907        // should appear in a_only
908        assert!(
909            result.attributes.a_only.contains(&"sorted_sequences".to_string()),
910            "sorted_sequences should be in a_only"
911        );
912        assert!(
913            result.attributes.a_only.contains(&"name_length_pairs".to_string()),
914            "name_length_pairs should be in a_only"
915        );
916        assert!(
917            result.attributes.a_only.contains(&"sorted_name_length_pairs".to_string()),
918            "sorted_name_length_pairs should be in a_only"
919        );
920
921        // b_only should be empty (level-2 body only has core attributes)
922        assert!(
923            result.attributes.b_only.is_empty(),
924            "b_only should be empty when level-2 has only core attributes"
925        );
926    }
927
928    // =========================================================================
929    // list_collections pagination and filtering tests
930    // =========================================================================
931
932    #[test]
933    fn test_list_collections_paged_no_filters() {
934        let mut store = RefgetStore::in_memory();
935        store
936            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
937            .unwrap();
938        store
939            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
940            .unwrap();
941
942        let result = store.list_collections(0, 2, &[]).unwrap();
943        assert_eq!(result.results.len(), 2);
944        assert_eq!(result.pagination.total, 2);
945        assert_eq!(result.pagination.page, 0);
946        assert_eq!(result.pagination.page_size, 2);
947        // Results should be sorted by digest
948        assert!(result.results[0].digest <= result.results[1].digest);
949    }
950
951    #[test]
952    fn test_list_collections_paged_second_page() {
953        let mut store = RefgetStore::in_memory();
954        store
955            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
956            .unwrap();
957        store
958            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
959            .unwrap();
960
961        // Page 0 with page_size 1
962        let page0 = store.list_collections(0, 1, &[]).unwrap();
963        assert_eq!(page0.results.len(), 1);
964        assert_eq!(page0.pagination.total, 2);
965
966        // Page 1 with page_size 1
967        let page1 = store.list_collections(1, 1, &[]).unwrap();
968        assert_eq!(page1.results.len(), 1);
969        assert_eq!(page1.pagination.total, 2);
970
971        // Different results on different pages
972        assert_ne!(page0.results[0].digest, page1.results[0].digest);
973    }
974
975    #[test]
976    fn test_list_collections_paged_beyond_end() {
977        let mut store = RefgetStore::in_memory();
978        store
979            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
980            .unwrap();
981
982        let result = store.list_collections(10, 100, &[]).unwrap();
983        assert!(result.results.is_empty());
984        assert_eq!(result.pagination.total, 1);
985    }
986
987    #[test]
988    fn test_list_collections_single_filter() {
989        let mut store = RefgetStore::in_memory();
990        let (meta, _) = store
991            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
992            .unwrap();
993        store
994            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
995            .unwrap();
996
997        let result = store.list_collections(0, 100, &[("names", &meta.names_digest)]).unwrap();
998        assert_eq!(result.results.len(), 1);
999        assert_eq!(result.results[0].digest, meta.digest);
1000        assert_eq!(result.pagination.total, 1);
1001    }
1002
1003    #[test]
1004    fn test_list_collections_multi_filter_and() {
1005        let mut store = RefgetStore::in_memory();
1006        let (meta, _) = store
1007            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
1008            .unwrap();
1009        store
1010            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
1011            .unwrap();
1012
1013        // Filter by both names AND lengths -- should match only base.fa
1014        let result = store.list_collections(0, 100, &[
1015            ("names", &meta.names_digest),
1016            ("lengths", &meta.lengths_digest),
1017        ]).unwrap();
1018        assert_eq!(result.results.len(), 1);
1019        assert_eq!(result.results[0].digest, meta.digest);
1020    }
1021
1022    #[test]
1023    fn test_list_collections_filter_no_match() {
1024        let mut store = RefgetStore::in_memory();
1025        store
1026            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
1027            .unwrap();
1028
1029        let result = store.list_collections(0, 100, &[("names", "nonexistent_digest")]).unwrap();
1030        assert!(result.results.is_empty());
1031        assert_eq!(result.pagination.total, 0);
1032    }
1033
1034    #[test]
1035    fn test_list_collections_invalid_attribute() {
1036        let mut store = RefgetStore::in_memory();
1037        store
1038            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
1039            .unwrap();
1040
1041        let result = store.list_collections(0, 100, &[("unknown_attr", "digest")]);
1042        assert!(result.is_err());
1043    }
1044
1045    #[test]
1046    fn test_list_collections_filter_with_pagination() {
1047        let mut store = RefgetStore::in_memory();
1048        // base.fa and different_names.fa share the same lengths
1049        let (meta_a, _) = store
1050            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
1051            .unwrap();
1052        let (_meta_b, _) = store
1053            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
1054            .unwrap();
1055
1056        // Both collections share lengths -- filter by lengths, page_size=1
1057        let page0 = store.list_collections(0, 1, &[("lengths", &meta_a.lengths_digest)]).unwrap();
1058        assert_eq!(page0.results.len(), 1);
1059        assert_eq!(page0.pagination.total, 2); // both match lengths filter
1060
1061        let page1 = store.list_collections(1, 1, &[("lengths", &meta_a.lengths_digest)]).unwrap();
1062        assert_eq!(page1.results.len(), 1);
1063        assert_eq!(page1.pagination.total, 2);
1064
1065        assert_ne!(page0.results[0].digest, page1.results[0].digest);
1066    }
1067
1068    // =========================================================================
1069    // SeqColService trait tests
1070    // =========================================================================
1071
1072    #[test]
1073    fn test_trait_object_safety() {
1074        use super::SeqColService;
1075        use std::sync::Arc;
1076
1077        let mut store = RefgetStore::in_memory();
1078        let (meta, _) = store
1079            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
1080            .unwrap();
1081
1082        let readonly = store.into_readonly();
1083        let service: Arc<dyn SeqColService + Send + Sync> = Arc::new(readonly);
1084
1085        // Call each trait method through the trait object
1086        let lvl1 = service.get_collection_level1(&meta.digest).unwrap();
1087        assert_eq!(lvl1.names, meta.names_digest);
1088
1089        let lvl2 = service.get_collection_level2(&meta.digest).unwrap();
1090        assert_eq!(lvl2.names.len(), 3);
1091
1092        let cmp = service.compare(&meta.digest, &meta.digest).unwrap();
1093        assert_eq!(cmp.digests.a, meta.digest);
1094
1095        let cmp2 = service.compare_with_level2(&meta.digest, &lvl2).unwrap();
1096        assert_eq!(cmp2.digests.a, meta.digest);
1097
1098        let found = service
1099            .find_collections_by_attribute("names", &meta.names_digest)
1100            .unwrap();
1101        assert_eq!(found.len(), 1);
1102
1103        let attr = service
1104            .get_attribute("names", &meta.names_digest)
1105            .unwrap();
1106        assert!(attr.is_some());
1107
1108        let paged = service.list_collections(0, 10, &[]).unwrap();
1109        assert_eq!(paged.results.len(), 1);
1110
1111        assert_eq!(service.collection_count(), 1);
1112    }
1113
1114    #[test]
1115    fn test_collection_count() {
1116        let mut store = RefgetStore::in_memory();
1117        store
1118            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
1119            .unwrap();
1120        store
1121            .add_sequence_collection_from_fasta("../tests/data/fasta/different_names.fa", FastaImportOptions::new())
1122            .unwrap();
1123
1124        assert_eq!(store.collection_count(), 2);
1125    }
1126
1127    #[test]
1128    fn test_trait_methods_match_concrete() {
1129        use super::SeqColService;
1130
1131        let mut store = RefgetStore::in_memory();
1132        let (meta, _) = store
1133            .add_sequence_collection_from_fasta("../tests/data/fasta/base.fa", FastaImportOptions::new())
1134            .unwrap();
1135
1136        let readonly = store.into_readonly();
1137
1138        // Call via concrete method
1139        let concrete_lvl1 = ReadonlyRefgetStore::get_collection_level1(&readonly, &meta.digest).unwrap();
1140        // Call via trait
1141        let trait_ref: &dyn SeqColService = &readonly;
1142        let trait_lvl1 = trait_ref.get_collection_level1(&meta.digest).unwrap();
1143
1144        assert_eq!(concrete_lvl1.names, trait_lvl1.names);
1145        assert_eq!(concrete_lvl1.lengths, trait_lvl1.lengths);
1146        assert_eq!(concrete_lvl1.sequences, trait_lvl1.sequences);
1147
1148        // list_collections
1149        let concrete_list = ReadonlyRefgetStore::list_collections(&readonly, 0, 10, &[]).unwrap();
1150        let trait_list = trait_ref.list_collections(0, 10, &[]).unwrap();
1151        assert_eq!(concrete_list.results.len(), trait_list.results.len());
1152        assert_eq!(concrete_list.pagination.total, trait_list.pagination.total);
1153
1154        // collection_count
1155        assert_eq!(
1156            ReadonlyRefgetStore::collection_count(&readonly),
1157            trait_ref.collection_count()
1158        );
1159    }
1160}