// Source file: refget_model/seqcol.rs

1//! Sequence Collection types and algorithms for the refget Sequence Collections API.
2
3use std::collections::{BTreeMap, HashSet};
4
5use refget_digest::{digest_json, sha512t24u};
6use serde::{Deserialize, Serialize};
7
8/// The level of detail for a sequence collection response.
9#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)]
10pub enum Level {
11    /// Level 0: a single digest for the entire collection.
12    Zero,
13    /// Level 1: per-attribute digests.
14    One,
15    /// Level 2: full attribute arrays.
16    Two,
17}
18
19impl Level {
20    /// Parse a level from an integer (0, 1, or 2).
21    pub fn from_int(n: u8) -> Option<Self> {
22        match n {
23            0 => Some(Self::Zero),
24            1 => Some(Self::One),
25            2 => Some(Self::Two),
26            _ => None,
27        }
28    }
29}
30
31/// A Level 2 sequence collection: the full attribute arrays.
32#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
33pub struct SeqCol {
34    /// Sequence names (e.g. "chr1", "chr2", ...).
35    pub names: Vec<String>,
36    /// Sequence lengths.
37    pub lengths: Vec<u64>,
38    /// GA4GH sha512t24u digests of each sequence.
39    pub sequences: Vec<String>,
40    /// Optional: sorted name-length pairs digest.
41    #[serde(skip_serializing_if = "Option::is_none")]
42    pub sorted_name_length_pairs: Option<Vec<String>>,
43}
44
45/// Level 1 representation: per-attribute digests.
46#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
47pub struct SeqColLevel1 {
48    pub names: String,
49    pub lengths: String,
50    pub sequences: String,
51    #[serde(skip_serializing_if = "Option::is_none")]
52    pub sorted_name_length_pairs: Option<String>,
53}
54
55impl SeqCol {
56    /// Validate that all arrays have the same length.
57    pub fn validate(&self) -> Result<(), SeqColError> {
58        let n = self.names.len();
59        if self.lengths.len() != n {
60            return Err(SeqColError::MismatchedArrayLengths {
61                expected: n,
62                attribute: "lengths".to_string(),
63                actual: self.lengths.len(),
64            });
65        }
66        if self.sequences.len() != n {
67            return Err(SeqColError::MismatchedArrayLengths {
68                expected: n,
69                attribute: "sequences".to_string(),
70                actual: self.sequences.len(),
71            });
72        }
73        Ok(())
74    }
75
76    /// Compute the Level 0 digest (the single digest for the entire collection).
77    ///
78    /// This is computed from the inherent attributes (names, lengths, sequences)
79    /// by computing per-attribute digests, building a JSON object of those digests,
80    /// canonicalizing it with JCS, and hashing with sha512t24u.
81    pub fn digest(&self) -> String {
82        let level1 = self.to_level1_inherent();
83        let obj = serde_json::json!({
84            "lengths": level1.lengths,
85            "names": level1.names,
86            "sequences": level1.sequences,
87        });
88        digest_json(&obj)
89    }
90
91    /// Compute Level 1: per-attribute digests.
92    pub fn to_level1(&self) -> SeqColLevel1 {
93        let mut level1 = self.to_level1_inherent();
94        level1.sorted_name_length_pairs =
95            Some(digest_string_array(&self.sorted_name_length_pairs()));
96        level1
97    }
98
99    /// Compute Level 1 for inherent attributes only.
100    fn to_level1_inherent(&self) -> SeqColLevel1 {
101        SeqColLevel1 {
102            names: digest_string_array(&self.names),
103            lengths: digest_u64_array(&self.lengths),
104            sequences: digest_string_array(&self.sequences),
105            sorted_name_length_pairs: None,
106        }
107    }
108
109    /// Compute sorted name-length pairs as an array of strings.
110    ///
111    /// Each element is the sha512t24u of `name:length`, sorted lexicographically.
112    pub fn sorted_name_length_pairs(&self) -> Vec<String> {
113        let mut pairs = self.name_length_pairs();
114        pairs.sort();
115        pairs
116    }
117
118    /// Compute name-length pairs (unsorted) as an array of digests.
119    pub fn name_length_pairs(&self) -> Vec<String> {
120        self.names
121            .iter()
122            .zip(self.lengths.iter())
123            .map(|(name, length)| sha512t24u(format!("{name}:{length}").as_bytes()))
124            .collect()
125    }
126
127    /// Return the collection as a JSON value at the specified level.
128    pub fn to_json(&self, level: Level) -> serde_json::Value {
129        match level {
130            Level::Zero => serde_json::Value::String(self.digest()),
131            Level::One => serde_json::to_value(self.to_level1()).unwrap(),
132            Level::Two => {
133                let mut col = self.clone();
134                col.sorted_name_length_pairs = Some(self.sorted_name_length_pairs());
135                serde_json::to_value(col).unwrap()
136            }
137        }
138    }
139}
140
141/// Compare two sequence collections and produce a comparison result.
142pub fn compare(a: &SeqCol, b: &SeqCol) -> ComparisonResult {
143    let a_digest = a.digest();
144    let b_digest = b.digest();
145    // Attribute comparison: both collections have the same inherent attributes
146    let a_and_b: Vec<String> = INHERENT_ATTRIBUTES.iter().map(|s| (*s).to_string()).collect();
147    let a_only: Vec<String> = vec![];
148    let b_only: Vec<String> = vec![];
149
150    // For shared attributes, compute element-level comparison
151    let mut array_elements = BTreeMap::new();
152    for attr in &a_and_b {
153        let (a_vals, b_vals) = get_attribute_strings(a, b, attr);
154        let a_set: HashSet<&str> = a_vals.iter().map(String::as_str).collect();
155        let b_set: HashSet<&str> = b_vals.iter().map(String::as_str).collect();
156
157        let total_a = a_vals.len();
158        let total_b = b_vals.len();
159        let a_and_b_count = a_set.intersection(&b_set).count();
160        let a_only_count = a_set.difference(&b_set).count();
161        let b_only_count = b_set.difference(&a_set).count();
162        let order = if a_vals == b_vals { OrderResult::Match } else { OrderResult::Differ };
163
164        array_elements.insert(
165            attr.clone(),
166            ArrayElementComparison {
167                total_a,
168                total_b,
169                a_and_b: a_and_b_count,
170                a_only: a_only_count,
171                b_only: b_only_count,
172                order,
173            },
174        );
175    }
176
177    ComparisonResult {
178        digests: DigestComparison { a: a_digest, b: b_digest },
179        attributes: AttributeComparison { a_only, b_only, a_and_b },
180        array_elements,
181    }
182}
183
/// Names of the three inherent attributes of a sequence collection, in the
/// order in which [`compare`] reports them.
const INHERENT_ATTRIBUTES: &[&str] = &["names", "lengths", "sequences"];
186
187/// Get string representations of attribute values for comparison.
188fn get_attribute_strings(a: &SeqCol, b: &SeqCol, attr: &str) -> (Vec<String>, Vec<String>) {
189    match attr {
190        "names" => (a.names.clone(), b.names.clone()),
191        "lengths" => (
192            a.lengths.iter().map(|v| v.to_string()).collect(),
193            b.lengths.iter().map(|v| v.to_string()).collect(),
194        ),
195        "sequences" => (a.sequences.clone(), b.sequences.clone()),
196        _ => (vec![], vec![]),
197    }
198}
199
200/// The result of comparing two sequence collections.
201#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
202pub struct ComparisonResult {
203    pub digests: DigestComparison,
204    pub attributes: AttributeComparison,
205    pub array_elements: BTreeMap<String, ArrayElementComparison>,
206}
207
208/// Digest information for both collections in a comparison.
209#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
210pub struct DigestComparison {
211    pub a: String,
212    pub b: String,
213}
214
215/// Which attributes exist in each collection.
216#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
217pub struct AttributeComparison {
218    pub a_only: Vec<String>,
219    pub b_only: Vec<String>,
220    pub a_and_b: Vec<String>,
221}
222
223/// Element-level comparison for a single attribute.
224#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
225pub struct ArrayElementComparison {
226    pub total_a: usize,
227    pub total_b: usize,
228    pub a_and_b: usize,
229    pub a_only: usize,
230    pub b_only: usize,
231    pub order: OrderResult,
232}
233
234/// Whether the element order matches between two arrays.
235#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
236#[serde(rename_all = "lowercase")]
237pub enum OrderResult {
238    Match,
239    Differ,
240}
241
242/// Errors from sequence collection operations.
243#[derive(Debug, thiserror::Error)]
244pub enum SeqColError {
245    #[error("Array length mismatch: {attribute} has {actual} elements, expected {expected}")]
246    MismatchedArrayLengths { expected: usize, attribute: String, actual: usize },
247}
248
249/// Compute the sha512t24u digest of an array of strings, by converting to a JSON
250/// array and hashing the canonicalized form.
251fn digest_string_array(values: &[String]) -> String {
252    let json_array: Vec<serde_json::Value> =
253        values.iter().map(|v| serde_json::Value::String(v.clone())).collect();
254    let json = serde_json::Value::Array(json_array);
255    digest_json(&json)
256}
257
258/// Compute the sha512t24u digest of an array of u64 values.
259fn digest_u64_array(values: &[u64]) -> String {
260    let json_array: Vec<serde_json::Value> = values.iter().map(|v| serde_json::json!(v)).collect();
261    let json = serde_json::Value::Array(json_array);
262    digest_json(&json)
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of characters every digest in these tests is expected to have.
    const DIGEST_LEN: usize = 32;

    const CHR1_LEN: u64 = 248956422;
    const CHR2_LEN: u64 = 242193529;
    const CHR1_SEQ: &str = "SQ.IIB53T8CNeJJdUqzn1V4W1SqtRA";
    const CHR2_SEQ: &str = "SQ.v7noePfnNpK8ghYXEqZ9NukMXW0";

    /// Turn a slice of string literals into owned strings.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| (*item).to_string()).collect()
    }

    /// Build a collection from parallel slices, without sorted name-length pairs.
    fn seqcol(names: &[&str], lengths: &[u64], sequences: &[&str]) -> SeqCol {
        SeqCol {
            names: owned(names),
            lengths: lengths.to_vec(),
            sequences: owned(sequences),
            sorted_name_length_pairs: None,
        }
    }

    /// Two-sequence collection (chr1, chr2) used by most tests.
    fn example_seqcol() -> SeqCol {
        seqcol(&["chr1", "chr2"], &[CHR1_LEN, CHR2_LEN], &[CHR1_SEQ, CHR2_SEQ])
    }

    /// Collection with no sequences at all.
    fn empty_seqcol() -> SeqCol {
        seqcol(&[], &[], &[])
    }

    #[test]
    fn test_validate_ok() {
        assert!(example_seqcol().validate().is_ok());
    }

    #[test]
    fn test_validate_mismatched_lengths() {
        let mut col = example_seqcol();
        col.lengths.push(100);
        assert!(col.validate().is_err());
    }

    #[test]
    fn test_digest_deterministic() {
        let col = example_seqcol();
        let first = col.digest();
        let second = col.digest();
        assert_eq!(first, second);
        assert_eq!(first.len(), DIGEST_LEN);
    }

    #[test]
    fn test_level1() {
        let level1 = example_seqcol().to_level1();
        assert_eq!(level1.names.len(), DIGEST_LEN);
        assert_eq!(level1.lengths.len(), DIGEST_LEN);
        assert_eq!(level1.sequences.len(), DIGEST_LEN);
        assert!(level1.sorted_name_length_pairs.is_some());
    }

    #[test]
    fn test_sorted_name_length_pairs() {
        let pairs = example_seqcol().sorted_name_length_pairs();
        assert_eq!(pairs.len(), 2);
        // Every pair is itself a digest.
        for pair in &pairs {
            assert_eq!(pair.len(), DIGEST_LEN);
        }
        // The array must come back sorted.
        assert!(pairs[0] <= pairs[1]);
    }

    #[test]
    fn test_compare_identical() {
        let col = example_seqcol();
        let result = compare(&col, &col);
        assert_eq!(result.digests.a, result.digests.b);
        assert!(result.attributes.a_only.is_empty());
        assert!(result.attributes.b_only.is_empty());
        assert_eq!(result.attributes.a_and_b.len(), 3);
        for elem in result.array_elements.values() {
            assert_eq!(elem.a_only, 0);
            assert_eq!(elem.b_only, 0);
            assert_eq!(elem.order, OrderResult::Match);
        }
    }

    #[test]
    fn test_compare_different() {
        let a = example_seqcol();
        let mut b = example_seqcol();
        b.names[0] = "chrX".to_string();
        let result = compare(&a, &b);
        assert_ne!(result.digests.a, result.digests.b);
        let names_cmp = result.array_elements.get("names").unwrap();
        assert_eq!(names_cmp.a_only, 1);
        assert_eq!(names_cmp.b_only, 1);
    }

    #[test]
    fn test_to_json_levels() {
        let col = example_seqcol();
        assert!(col.to_json(Level::Zero).is_string());
        assert!(col.to_json(Level::One).is_object());
        let l2 = col.to_json(Level::Two);
        assert!(l2.is_object());
        assert!(l2.get("names").unwrap().is_array());
    }

    // --- Level::from_int invalid values ---

    #[test]
    fn test_level_from_int_invalid_3() {
        assert!(Level::from_int(3).is_none());
    }

    #[test]
    fn test_level_from_int_invalid_255() {
        assert!(Level::from_int(255).is_none());
    }

    // --- SeqCol::validate with empty arrays ---

    #[test]
    fn test_validate_all_empty_ok() {
        assert!(empty_seqcol().validate().is_ok());
    }

    #[test]
    fn test_validate_sequences_length_mismatch() {
        let mut col = example_seqcol();
        col.sequences.push("SQ.extra".to_string());
        let msg = col.validate().unwrap_err().to_string();
        assert!(msg.contains("sequences"), "error should mention 'sequences': {msg}");
    }

    // --- name_length_pairs output ---

    #[test]
    fn test_name_length_pairs_length_and_digest_size() {
        let pairs = example_seqcol().name_length_pairs();
        assert_eq!(pairs.len(), 2);
        for pair in &pairs {
            assert_eq!(pair.len(), DIGEST_LEN, "each name-length pair digest should be 32 chars");
        }
    }

    // --- compare: completely different collections ---

    #[test]
    fn test_compare_no_overlap() {
        let a = example_seqcol();
        let b = seqcol(
            &["chrX", "chrY"],
            &[1000, 2000],
            &["SQ.aaaaaaaaaaaaaaaaaaaaaaaaaaaa", "SQ.bbbbbbbbbbbbbbbbbbbbbbbbbbbb"],
        );
        let result = compare(&a, &b);
        assert_ne!(result.digests.a, result.digests.b);
        for elem in result.array_elements.values() {
            assert_eq!(elem.a_and_b, 0, "no elements should overlap");
            assert_eq!(elem.a_only, elem.total_a);
            assert_eq!(elem.b_only, elem.total_b);
        }
    }

    // --- compare: different-length collections ---

    #[test]
    fn test_compare_different_lengths() {
        let a = example_seqcol();
        let b = seqcol(&["chr1"], &[CHR1_LEN], &[CHR1_SEQ]);
        let result = compare(&a, &b);
        let names_cmp = result.array_elements.get("names").unwrap();
        assert_eq!(names_cmp.total_a, 2);
        assert_eq!(names_cmp.total_b, 1);
        assert_eq!(names_cmp.a_and_b, 1);
        assert_eq!(names_cmp.a_only, 1);
        assert_eq!(names_cmp.b_only, 0);
    }

    // --- compare: same elements, different order ---

    #[test]
    fn test_compare_same_elements_different_order() {
        let a = example_seqcol();
        let b = seqcol(&["chr2", "chr1"], &[CHR2_LEN, CHR1_LEN], &[CHR2_SEQ, CHR1_SEQ]);
        let result = compare(&a, &b);
        // Order is part of the level-0 digest, so the digests must differ.
        assert_ne!(result.digests.a, result.digests.b);
        for elem in result.array_elements.values() {
            assert_eq!(elem.order, OrderResult::Differ, "order should differ");
            assert_eq!(elem.a_and_b, elem.total_a, "all elements of a should be in b");
            assert_eq!(elem.a_and_b, elem.total_b, "all elements of b should be in a");
            assert_eq!(elem.a_only, 0);
            assert_eq!(elem.b_only, 0);
        }
    }

    // --- to_json Level::Zero returns a JSON string ---

    #[test]
    fn test_to_json_level_zero_is_string() {
        let json = example_seqcol().to_json(Level::Zero);
        assert!(json.is_string(), "Level::Zero JSON should be a string");
        assert_eq!(
            json.as_str().unwrap().len(),
            DIGEST_LEN,
            "Level::Zero digest should be 32 chars"
        );
    }

    // --- to_json Level::Two includes sorted_name_length_pairs ---

    #[test]
    fn test_to_json_level_two_has_sorted_name_length_pairs() {
        let json = example_seqcol().to_json(Level::Two);
        let snlp = json.get("sorted_name_length_pairs");
        assert!(snlp.is_some(), "Level::Two should include sorted_name_length_pairs");
        assert!(snlp.unwrap().is_array());
    }

    // --- empty collections produce valid 32-char digests (exercises digest_string_array / digest_u64_array) ---

    #[test]
    fn test_empty_collection_digests_are_valid() {
        let col = empty_seqcol();
        // `digest` goes through digest_string_array (names, sequences) and
        // digest_u64_array (lengths).
        let digest = col.digest();
        assert_eq!(digest.len(), DIGEST_LEN, "digest of empty collection should be 32 chars");

        let level1 = col.to_level1();
        assert_eq!(level1.names.len(), DIGEST_LEN);
        assert_eq!(level1.lengths.len(), DIGEST_LEN);
        assert_eq!(level1.sequences.len(), DIGEST_LEN);
        // names and sequences are both empty string arrays, so they digest identically.
        assert_eq!(level1.names, level1.sequences);
    }

    // --- single-element SeqCol: validate, digest, level1 ---

    #[test]
    fn test_single_element_seqcol() {
        let col = seqcol(&["chrM"], &[16569], &["SQ.someDigest_chrM_placeholder00"]);
        assert!(col.validate().is_ok());

        assert_eq!(col.digest().len(), DIGEST_LEN);

        let level1 = col.to_level1();
        assert_eq!(level1.names.len(), DIGEST_LEN);
        assert_eq!(level1.lengths.len(), DIGEST_LEN);
        assert_eq!(level1.sequences.len(), DIGEST_LEN);
        assert!(level1.sorted_name_length_pairs.is_some());
        assert_eq!(level1.sorted_name_length_pairs.unwrap().len(), DIGEST_LEN);
    }
}