gtars_refget/digest/
types.rs

1//! Core types for sequence collections - WASM-safe.
2//!
3//! This module contains the fundamental data structures for representing sequences
4//! and sequence collections. All types here are WASM-compatible and don't require
5//! filesystem access.
6
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt::Display;
10use std::path::PathBuf;
11
12use super::algorithms::{canonicalize_json, md5, sha512t24u};
13use super::alphabet::{AlphabetType, guess_alphabet};
14
15/// Metadata for a single sequence, including its name, length, digests, and alphabet type.
16#[derive(Clone, Debug, Serialize, Deserialize)]
17pub struct SequenceMetadata {
18    pub name: String,
19    /// Description from FASTA header (text after first whitespace).
20    #[serde(default)]
21    pub description: Option<String>,
22    pub length: usize,
23    pub sha512t24u: String,
24    pub md5: String,
25    pub alphabet: AlphabetType,
26    pub fai: Option<FaiMetadata>,
27}
28
29impl Default for SequenceMetadata {
30    fn default() -> Self {
31        Self {
32            name: String::new(),
33            description: None,
34            length: 0,
35            sha512t24u: String::new(),
36            md5: String::new(),
37            alphabet: AlphabetType::Ascii,
38            fai: None,
39        }
40    }
41}
42
43/// FASTA index (FAI) metadata for a sequence.
44/// This data is only present when a sequence was loaded from a FASTA file.
45#[derive(Clone, Debug, Serialize, Deserialize)]
46pub struct FaiMetadata {
47    pub offset: u64,     // byte offset to first base of sequence data
48    pub line_bases: u32, // number of bases per line
49    pub line_bytes: u32, // number of bytes per line (including newline chars)
50}
51
52/// A representation of a single sequence that includes metadata and optionally data.
53/// Combines sequence metadata with optional raw/encoded data.
54///
55/// This enum has two variants:
56/// - `Stub`: Contains only metadata, no sequence data loaded
57/// - `Full`: Contains both metadata and the actual sequence data
58#[derive(Clone, Debug)]
59pub enum SequenceRecord {
60    /// A sequence record with only metadata, no sequence data
61    Stub(SequenceMetadata),
62    /// A sequence record with both metadata and sequence data
63    Full {
64        metadata: SequenceMetadata,
65        sequence: Vec<u8>,
66    },
67}
68
69impl SequenceRecord {
70    /// Get metadata regardless of variant
71    pub fn metadata(&self) -> &SequenceMetadata {
72        match self {
73            SequenceRecord::Stub(meta) => meta,
74            SequenceRecord::Full { metadata, .. } => metadata,
75        }
76    }
77
78    /// Get sequence data if present
79    pub fn sequence(&self) -> Option<&[u8]> {
80        match self {
81            SequenceRecord::Stub(_) => None,
82            SequenceRecord::Full { sequence, .. } => Some(sequence),
83        }
84    }
85
86    /// Check if sequence data is loaded (Full) or just metadata (Stub).
87    pub fn is_loaded(&self) -> bool {
88        matches!(self, SequenceRecord::Full { .. })
89    }
90
91    /// Load data into a Stub record, or replace data in a Full record (takes ownership)
92    pub fn with_data(self, sequence: Vec<u8>) -> Self {
93        let metadata = match self {
94            SequenceRecord::Stub(m) => m,
95            SequenceRecord::Full { metadata, .. } => metadata,
96        };
97        SequenceRecord::Full { metadata, sequence }
98    }
99
100    /// Load data into a Stub record in-place, converting it to Full.
101    /// If already Full, replaces the existing sequence data.
102    ///
103    /// This is more efficient than `with_data()` when you have a mutable reference,
104    /// as it avoids cloning the metadata.
105    pub fn load_data(&mut self, sequence: Vec<u8>) {
106        match self {
107            SequenceRecord::Stub(metadata) => {
108                // Take ownership of metadata without cloning
109                let metadata = std::mem::take(metadata);
110                *self = SequenceRecord::Full { metadata, sequence };
111            }
112            SequenceRecord::Full {
113                sequence: existing, ..
114            } => {
115                // Just replace the sequence data
116                *existing = sequence;
117            }
118        }
119    }
120
121    /// Decodes the sequence data to a string.
122    ///
123    /// This method attempts to decode the sequence data stored in this record.
124    /// It handles both raw (uncompressed UTF-8) and encoded (bit-packed) data.
125    /// The decoding strategy depends on the alphabet type:
126    /// - For ASCII alphabet: data is already in raw form, just convert to string
127    /// - For other alphabets: attempt encoded decoding first, fall back to raw
128    ///
129    /// # Returns
130    ///
131    /// * `Some(String)` - The decoded sequence if data is loaded
132    /// * `None` - If no data is loaded in this record
133    pub fn decode(&self) -> Option<String> {
134        use super::alphabet::lookup_alphabet;
135        use super::encoder::decode_substring_from_bytes;
136
137        let (metadata, data) = match self {
138            SequenceRecord::Stub(_) => return None,
139            SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
140        };
141
142        // For ASCII alphabet (8 bits per symbol), the data is always stored raw
143        if metadata.alphabet == AlphabetType::Ascii {
144            return String::from_utf8(data.clone()).ok();
145        }
146
147        // Try to detect if data is raw or encoded
148        // Heuristic: for encoded data, the size should be approximately length * bits_per_symbol / 8
149        // For raw data, the size should be approximately equal to length
150        let alphabet = lookup_alphabet(&metadata.alphabet);
151
152        // If data size matches the expected length (not the encoded size), it's probably raw
153        if data.len() == metadata.length {
154            // Try to decode as UTF-8
155            if let Ok(raw_string) = String::from_utf8(data.clone()) {
156                // Data appears to be raw UTF-8
157                return Some(raw_string);
158            }
159        }
160
161        // Data is probably encoded (size matches expected encoded size), try to decode it
162        let decoded_bytes = decode_substring_from_bytes(data, 0, metadata.length, alphabet);
163
164        // Convert to string
165        String::from_utf8(decoded_bytes).ok()
166    }
167}
168
169impl Display for SequenceRecord {
170    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
171        write!(
172            f,
173            "SequenceRecord: {} (length: {}, alphabet: {}, ga4gh: {:02x?}, md5: {:02x?})",
174            &self.metadata().name,
175            &self.metadata().length,
176            &self.metadata().alphabet,
177            &self.metadata().sha512t24u,
178            &self.metadata().md5
179        )?;
180        Ok(())
181    }
182}
183
184/// A struct representing the first level of digests for a refget sequence collection.
185#[derive(Debug, Serialize, Deserialize, Clone)]
186pub struct SeqColDigestLvl1 {
187    pub sequences_digest: String,
188    pub names_digest: String,
189    pub lengths_digest: String,
190}
191
192impl SeqColDigestLvl1 {
193    /// Compute collection digest from lvl1 digests
194    pub fn to_digest(&self) -> String {
195        // Create JSON object with the lvl1 digest strings
196        let mut lvl1_object = serde_json::Map::new();
197        lvl1_object.insert(
198            "names".to_string(),
199            serde_json::Value::String(self.names_digest.clone()),
200        );
201        lvl1_object.insert(
202            "sequences".to_string(),
203            serde_json::Value::String(self.sequences_digest.clone()),
204        );
205
206        let lvl1_json = serde_json::Value::Object(lvl1_object);
207
208        // Canonicalize the JSON object and compute collection digest
209        let lvl1_canonical = canonicalize_json(&lvl1_json);
210        sha512t24u(lvl1_canonical.as_bytes())
211    }
212
213    /// Compute lvl1 digests from a collection of SequenceMetadata
214    pub fn from_metadata(metadata_vec: &[&SequenceMetadata]) -> Self {
215        use serde_json::Value;
216
217        // Extract arrays for each field
218        let sequences: Vec<String> = metadata_vec
219            .iter()
220            .map(|md| format!("SQ.{}", md.sha512t24u))
221            .collect();
222        let names: Vec<&str> = metadata_vec.iter().map(|md| md.name.as_str()).collect();
223        let lengths: Vec<usize> = metadata_vec.iter().map(|md| md.length).collect();
224
225        // Convert to JSON Values and canonicalize
226        let sequences_json = Value::Array(
227            sequences
228                .iter()
229                .map(|s| Value::String(s.to_string()))
230                .collect(),
231        );
232        let names_json = Value::Array(names.iter().map(|s| Value::String(s.to_string())).collect());
233        let lengths_json = Value::Array(
234            lengths
235                .iter()
236                .map(|l| Value::Number(serde_json::Number::from(*l)))
237                .collect(),
238        );
239
240        // Canonicalize to JCS format
241        let sequences_canonical = canonicalize_json(&sequences_json);
242        let names_canonical = canonicalize_json(&names_json);
243        let lengths_canonical = canonicalize_json(&lengths_json);
244
245        // Hash the canonicalized arrays
246        SeqColDigestLvl1 {
247            sequences_digest: sha512t24u(sequences_canonical.as_bytes()),
248            names_digest: sha512t24u(names_canonical.as_bytes()),
249            lengths_digest: sha512t24u(lengths_canonical.as_bytes()),
250        }
251    }
252
253    /// Compute name_length_pairs digest.
254    ///
255    /// Algorithm: for each sequence, create {"length": L, "name": "N"},
256    /// canonicalize each to JSON, digest each, collect into array,
257    /// canonicalize array, digest array.
258    pub fn compute_name_length_pairs_digest(metadata: &[&SequenceMetadata]) -> String {
259        use serde_json::Value;
260
261        // Build array of {"length": N, "name": "X"} pair objects
262        let pairs: Vec<Value> = metadata
263            .iter()
264            .map(|md| {
265                let mut obj = serde_json::Map::new();
266                obj.insert(
267                    "length".to_string(),
268                    Value::Number(serde_json::Number::from(md.length)),
269                );
270                obj.insert("name".to_string(), Value::String(md.name.clone()));
271                Value::Object(obj)
272            })
273            .collect();
274
275        // Canonicalize the entire array of pair objects, then digest
276        let canonical = canonicalize_json(&Value::Array(pairs));
277        sha512t24u(canonical.as_bytes())
278    }
279
280    /// Compute sorted_name_length_pairs digest (order-invariant coordinate system).
281    ///
282    /// Algorithm: same as name_length_pairs but sort the individual pair digests
283    /// lexicographically before digesting the array.
284    pub fn compute_sorted_name_length_pairs_digest(metadata: &[&SequenceMetadata]) -> String {
285        use serde_json::Value;
286
287        let mut pair_digests: Vec<String> = metadata
288            .iter()
289            .map(|md| {
290                let mut obj = serde_json::Map::new();
291                obj.insert(
292                    "length".to_string(),
293                    Value::Number(serde_json::Number::from(md.length)),
294                );
295                obj.insert("name".to_string(), Value::String(md.name.clone()));
296                let canonical = canonicalize_json(&Value::Object(obj));
297                sha512t24u(canonical.as_bytes())
298            })
299            .collect();
300
301        pair_digests.sort();
302
303        let array_json = Value::Array(
304            pair_digests
305                .iter()
306                .map(|d| Value::String(d.clone()))
307                .collect(),
308        );
309        let canonical = canonicalize_json(&array_json);
310        sha512t24u(canonical.as_bytes())
311    }
312
313    /// Compute sorted_sequences digest.
314    ///
315    /// Algorithm: take sequences array (with SQ. prefix), sort lexicographically,
316    /// canonicalize, digest.
317    pub fn compute_sorted_sequences_digest(metadata: &[&SequenceMetadata]) -> String {
318        use serde_json::Value;
319
320        let mut sequences: Vec<String> = metadata
321            .iter()
322            .map(|md| format!("SQ.{}", md.sha512t24u))
323            .collect();
324
325        sequences.sort();
326
327        let array_json = Value::Array(
328            sequences
329                .iter()
330                .map(|s| Value::String(s.clone()))
331                .collect(),
332        );
333        let canonical = canonicalize_json(&array_json);
334        sha512t24u(canonical.as_bytes())
335    }
336}
337
338/// Metadata for a sequence collection (parallel to SequenceMetadata).
339/// Contains the collection digest and level 1 digests for names, sequences, and lengths.
340#[derive(Clone, Debug, Serialize, Deserialize)]
341pub struct SequenceCollectionMetadata {
342    /// Top-level seqcol digest
343    pub digest: String,
344    /// Number of sequences in the collection
345    pub n_sequences: usize,
346    /// Level 1 digest of names array
347    pub names_digest: String,
348    /// Level 1 digest of sequences array
349    pub sequences_digest: String,
350    /// Level 1 digest of lengths array
351    pub lengths_digest: String,
352    /// Ancillary: digest of name_length_pairs array
353    #[serde(default, skip_serializing_if = "Option::is_none")]
354    pub name_length_pairs_digest: Option<String>,
355    /// Ancillary: digest of sorted_name_length_pairs array (order-invariant coordinate system)
356    #[serde(default, skip_serializing_if = "Option::is_none")]
357    pub sorted_name_length_pairs_digest: Option<String>,
358    /// Ancillary: digest of sorted sequences array
359    #[serde(default, skip_serializing_if = "Option::is_none")]
360    pub sorted_sequences_digest: Option<String>,
361    /// Optional path to the source file
362    pub file_path: Option<PathBuf>,
363}
364
365impl SequenceCollectionMetadata {
366    /// Compute metadata from sequence records (core digests only).
367    pub fn from_sequences(
368        sequences: &[SequenceRecord],
369        file_path: Option<PathBuf>,
370    ) -> Self {
371        // Extract metadata refs
372        let metadata_refs: Vec<&SequenceMetadata> =
373            sequences.iter().map(|r| r.metadata()).collect();
374
375        // Compute level 1 digests
376        let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs);
377
378        // Compute top-level digest from level 1 digests
379        let digest = lvl1.to_digest();
380
381        Self {
382            digest,
383            n_sequences: sequences.len(),
384            names_digest: lvl1.names_digest,
385            sequences_digest: lvl1.sequences_digest,
386            lengths_digest: lvl1.lengths_digest,
387            name_length_pairs_digest: None,
388            sorted_name_length_pairs_digest: None,
389            sorted_sequences_digest: None,
390            file_path,
391        }
392    }
393
394    /// Compute ancillary digests (name_length_pairs, sorted_name_length_pairs,
395    /// sorted_sequences) from sequence records. No-op if already computed.
396    pub fn compute_ancillary_digests(&mut self, sequences: &[SequenceRecord]) {
397        if self.name_length_pairs_digest.is_some() {
398            return;
399        }
400        let metadata_refs: Vec<&SequenceMetadata> =
401            sequences.iter().map(|r| r.metadata()).collect();
402        self.name_length_pairs_digest =
403            Some(SeqColDigestLvl1::compute_name_length_pairs_digest(&metadata_refs));
404        self.sorted_name_length_pairs_digest =
405            Some(SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&metadata_refs));
406        self.sorted_sequences_digest =
407            Some(SeqColDigestLvl1::compute_sorted_sequences_digest(&metadata_refs));
408    }
409
410    /// Create from an existing SequenceCollection
411    pub fn from_collection(collection: &SequenceCollection) -> Self {
412        collection.metadata.clone()
413    }
414
415    /// Convert to SeqColDigestLvl1 for compatibility
416    pub fn to_lvl1(&self) -> SeqColDigestLvl1 {
417        SeqColDigestLvl1 {
418            sequences_digest: self.sequences_digest.clone(),
419            names_digest: self.names_digest.clone(),
420            lengths_digest: self.lengths_digest.clone(),
421        }
422    }
423
424    /// Return level 1 representation (attribute digests with spec-compliant field names).
425    pub fn to_level1(&self) -> CollectionLevel1 {
426        CollectionLevel1 {
427            names: self.names_digest.clone(),
428            lengths: self.lengths_digest.clone(),
429            sequences: self.sequences_digest.clone(),
430            name_length_pairs: self.name_length_pairs_digest.clone(),
431            sorted_name_length_pairs: self.sorted_name_length_pairs_digest.clone(),
432            sorted_sequences: self.sorted_sequences_digest.clone(),
433        }
434    }
435}
436
437/// Level 1 representation: attribute digests with spec-compliant JSON field names.
438#[derive(Debug, Clone, Serialize, Deserialize)]
439pub struct CollectionLevel1 {
440    pub names: String,
441    pub lengths: String,
442    pub sequences: String,
443    #[serde(skip_serializing_if = "Option::is_none")]
444    pub name_length_pairs: Option<String>,
445    #[serde(skip_serializing_if = "Option::is_none")]
446    pub sorted_name_length_pairs: Option<String>,
447    #[serde(skip_serializing_if = "Option::is_none")]
448    pub sorted_sequences: Option<String>,
449}
450
451/// Level 2 representation: full arrays with spec-compliant JSON field names.
452/// Sequences include SQ. prefix per spec.
453#[derive(Debug, Clone, Serialize, Deserialize)]
454pub struct CollectionLevel2 {
455    pub names: Vec<String>,
456    pub lengths: Vec<usize>,
457    /// Sequence digests with SQ. prefix per spec
458    pub sequences: Vec<String>,
459}
460
461/// Result of comparing two sequence collections.
462#[derive(Debug, Clone, Serialize, Deserialize)]
463pub struct SeqColComparison {
464    pub digests: ComparisonDigests,
465    pub attributes: AttributeComparison,
466    pub array_elements: ArrayElementComparison,
467}
468
469/// The digests of the two compared collections.
470/// `b` is `None` when comparing against an externally-provided level-2 body
471/// (i.e., the external collection has no server-side digest).
472#[derive(Debug, Clone, Serialize, Deserialize)]
473pub struct ComparisonDigests {
474    pub a: String,
475    pub b: Option<String>,
476}
477
478/// Which attributes (array names) are in A only, B only, or both.
479#[derive(Debug, Clone, Serialize, Deserialize)]
480pub struct AttributeComparison {
481    pub a_only: Vec<String>,
482    pub b_only: Vec<String>,
483    pub a_and_b: Vec<String>,
484}
485
486/// Element-level comparison for each shared attribute.
487#[derive(Debug, Clone, Serialize, Deserialize)]
488pub struct ArrayElementComparison {
489    pub a_count: HashMap<String, usize>,
490    pub b_count: HashMap<String, usize>,
491    pub a_and_b_count: HashMap<String, usize>,
492    pub a_and_b_same_order: HashMap<String, Option<bool>>,
493}
494
495/// A single Sequence Collection, which may or may not hold data.
496#[derive(Clone, Debug)]
497pub struct SequenceCollection {
498    /// Collection metadata (digest, level 1 digests, n_sequences, file_path)
499    pub metadata: SequenceCollectionMetadata,
500
501    /// Vector of SequenceRecords, which contain metadata (name, length, digests, alphabet)
502    /// and optionally the actual sequence data.
503    pub sequences: Vec<SequenceRecord>,
504}
505
506impl SequenceCollection {
507    /// Create a SequenceCollection from a vector of SequenceRecords.
508    pub fn from_records(records: Vec<SequenceRecord>) -> Self {
509        // Compute metadata from the sequence records (with ancillary digests)
510        let metadata = SequenceCollectionMetadata::from_sequences(&records, None);
511
512        SequenceCollection {
513            metadata,
514            sequences: records,
515        }
516    }
517
518    /// Return level 2 representation (full arrays, spec format).
519    /// Transposes Vec<SequenceRecord> into parallel arrays.
520    pub fn to_level2(&self) -> CollectionLevel2 {
521        let names: Vec<String> = self
522            .sequences
523            .iter()
524            .map(|r| r.metadata().name.clone())
525            .collect();
526        let lengths: Vec<usize> = self.sequences.iter().map(|r| r.metadata().length).collect();
527        let sequences: Vec<String> = self
528            .sequences
529            .iter()
530            .map(|r| format!("SQ.{}", r.metadata().sha512t24u))
531            .collect();
532
533        CollectionLevel2 {
534            names,
535            lengths,
536            sequences,
537        }
538    }
539
540    /// Build the sorted_sequences array as SQ.-prefixed digest strings, sorted lexicographically.
541    pub fn build_sorted_sequences(&self) -> Vec<String> {
542        let mut seqs: Vec<String> = self
543            .sequences
544            .iter()
545            .map(|r| format!("SQ.{}", r.metadata().sha512t24u))
546            .collect();
547        seqs.sort();
548        seqs
549    }
550
551    /// Build the name_length_pairs array as JSON value objects {"length": N, "name": "X"}.
552    /// Keys are in alphabetical order per canonical JSON.
553    pub fn build_name_length_pairs(&self) -> Vec<serde_json::Value> {
554        self.sequences
555            .iter()
556            .map(|r| {
557                let md = r.metadata();
558                let mut obj = serde_json::Map::new();
559                obj.insert(
560                    "length".to_string(),
561                    serde_json::Value::Number(serde_json::Number::from(md.length)),
562                );
563                obj.insert(
564                    "name".to_string(),
565                    serde_json::Value::String(md.name.clone()),
566                );
567                serde_json::Value::Object(obj)
568            })
569            .collect()
570    }
571
572    /// Build the sorted_name_length_pairs array as JSON value objects, sorted by their
573    /// canonical JSON digest.
574    pub fn build_sorted_name_length_pairs(&self) -> Vec<serde_json::Value> {
575        let mut pairs_with_digests: Vec<(String, serde_json::Value)> = self
576            .sequences
577            .iter()
578            .map(|r| {
579                let md = r.metadata();
580                let mut obj = serde_json::Map::new();
581                obj.insert(
582                    "length".to_string(),
583                    serde_json::Value::Number(serde_json::Number::from(md.length)),
584                );
585                obj.insert(
586                    "name".to_string(),
587                    serde_json::Value::String(md.name.clone()),
588                );
589                let val = serde_json::Value::Object(obj);
590                let digest = sha512t24u(canonicalize_json(&val).as_bytes());
591                (digest, val)
592            })
593            .collect();
594        pairs_with_digests.sort_by(|a, b| a.0.cmp(&b.0));
595        pairs_with_digests.into_iter().map(|(_, v)| v).collect()
596    }
597
598    /// Compare this collection with another, following the seqcol spec comparison algorithm.
599    ///
600    /// Dynamically includes ancillary attributes (name_length_pairs, sorted_name_length_pairs,
601    /// sorted_sequences) when present in each collection's metadata.
602    pub fn compare(&self, other: &SequenceCollection) -> SeqColComparison {
603        let arrays_a = self.to_comparison_arrays();
604        let arrays_b = other.to_comparison_arrays();
605        compare_arrays(
606            arrays_a,
607            arrays_b,
608            self.metadata.digest.clone(),
609            Some(other.metadata.digest.clone()),
610        )
611    }
612
613    /// Build string arrays for comparison, including ancillary attributes when present.
614    pub(crate) fn to_comparison_arrays(&self) -> HashMap<String, Vec<String>> {
615        let mut map = HashMap::new();
616
617        // Core 3 are always present
618        map.insert(
619            "names".to_string(),
620            self.sequences.iter().map(|r| r.metadata().name.clone()).collect(),
621        );
622        map.insert(
623            "lengths".to_string(),
624            self.sequences.iter().map(|r| r.metadata().length.to_string()).collect(),
625        );
626        map.insert(
627            "sequences".to_string(),
628            self.sequences.iter().map(|r| format!("SQ.{}", r.metadata().sha512t24u)).collect(),
629        );
630
631        // Ancillary: only include if this collection has them computed
632        if self.metadata.sorted_sequences_digest.is_some() {
633            // sorted_sequences: sort the SQ.-prefixed digests
634            let mut sorted_seqs: Vec<String> = self.sequences
635                .iter()
636                .map(|r| format!("SQ.{}", r.metadata().sha512t24u))
637                .collect();
638            sorted_seqs.sort();
639            map.insert("sorted_sequences".to_string(), sorted_seqs);
640        }
641
642        if self.metadata.name_length_pairs_digest.is_some() {
643            // name_length_pairs: canonical JSON of each {length, name} pair
644            let nlp: Vec<String> = self.sequences
645                .iter()
646                .map(|r| {
647                    let md = r.metadata();
648                    let mut obj = serde_json::Map::new();
649                    obj.insert("length".to_string(), serde_json::Value::Number(serde_json::Number::from(md.length)));
650                    obj.insert("name".to_string(), serde_json::Value::String(md.name.clone()));
651                    canonicalize_json(&serde_json::Value::Object(obj))
652                })
653                .collect();
654            map.insert("name_length_pairs".to_string(), nlp);
655        }
656
657        if self.metadata.sorted_name_length_pairs_digest.is_some() {
658            // sorted_name_length_pairs: digest each pair, sort the digests
659            let mut snlp: Vec<String> = self.sequences
660                .iter()
661                .map(|r| {
662                    let md = r.metadata();
663                    let mut obj = serde_json::Map::new();
664                    obj.insert("length".to_string(), serde_json::Value::Number(serde_json::Number::from(md.length)));
665                    obj.insert("name".to_string(), serde_json::Value::String(md.name.clone()));
666                    sha512t24u(canonicalize_json(&serde_json::Value::Object(obj)).as_bytes())
667                })
668                .collect();
669            snlp.sort();
670            map.insert("sorted_name_length_pairs".to_string(), snlp);
671        }
672
673        map
674    }
675}
676
/// Compare two arrays element-wise following the seqcol spec.
/// Returns (overlap_count, same_order).
///
/// Port of Python `_compare_elements()`:
/// - Filter A to elements present in B, filter B to elements present in A
///   (duplicates are kept, original order is kept)
/// - overlap = min of the two filtered lengths
/// - same_order: `None` if fewer than 2 overlapping elements or if the filtered
///   lengths differ (unbalanced duplicates), otherwise `filtered_a == filtered_b`
fn compare_elements(a: &[String], b: &[String]) -> (usize, Option<bool>) {
    use std::collections::HashSet;

    let in_a: HashSet<&str> = a.iter().map(String::as_str).collect();
    let in_b: HashSet<&str> = b.iter().map(String::as_str).collect();

    // Keep only the elements the other side also has.
    let shared_a: Vec<&str> = a
        .iter()
        .map(String::as_str)
        .filter(|s| in_b.contains(s))
        .collect();
    let shared_b: Vec<&str> = b
        .iter()
        .map(String::as_str)
        .filter(|s| in_a.contains(s))
        .collect();

    let overlap = shared_a.len().min(shared_b.len());

    // `overlap` is the smaller length, so equal lengths imply both equal `overlap`;
    // no separate check against `overlap` is needed.
    let same_order = if overlap >= 2 && shared_a.len() == shared_b.len() {
        Some(shared_a == shared_b)
    } else {
        None
    };

    (overlap, same_order)
}
708
709/// Core comparison algorithm operating on pre-built attribute array maps.
710/// Used by both `SequenceCollection::compare()` and the external-body comparison path.
711pub(crate) fn compare_arrays(
712    arrays_a: HashMap<String, Vec<String>>,
713    arrays_b: HashMap<String, Vec<String>>,
714    digest_a: String,
715    digest_b: Option<String>,
716) -> SeqColComparison {
717    let a_keys: std::collections::BTreeSet<&str> = arrays_a.keys().map(|s| s.as_str()).collect();
718    let b_keys: std::collections::BTreeSet<&str> = arrays_b.keys().map(|s| s.as_str()).collect();
719
720    let mut a_only = Vec::new();
721    let mut b_only = Vec::new();
722    let mut a_and_b = Vec::new();
723
724    let mut all_keys: Vec<&str> = a_keys.union(&b_keys).copied().collect();
725    all_keys.sort();
726
727    for key in all_keys {
728        let in_a = a_keys.contains(key);
729        let in_b = b_keys.contains(key);
730        match (in_a, in_b) {
731            (true, true) => a_and_b.push(key.to_string()),
732            (true, false) => a_only.push(key.to_string()),
733            (false, true) => b_only.push(key.to_string()),
734            (false, false) => unreachable!(),
735        }
736    }
737
738    let mut a_count = HashMap::new();
739    let mut b_count = HashMap::new();
740    let mut a_and_b_count = HashMap::new();
741    let mut a_and_b_same_order = HashMap::new();
742
743    for (k, v) in &arrays_a {
744        a_count.insert(k.clone(), v.len());
745    }
746    for (k, v) in &arrays_b {
747        b_count.insert(k.clone(), v.len());
748    }
749
750    for attr in &a_and_b {
751        let arr_a = arrays_a.get(attr).unwrap();
752        let arr_b = arrays_b.get(attr).unwrap();
753        let (overlap, same_order) = compare_elements(arr_a, arr_b);
754        a_and_b_count.insert(attr.clone(), overlap);
755        a_and_b_same_order.insert(attr.clone(), same_order);
756    }
757
758    SeqColComparison {
759        digests: ComparisonDigests {
760            a: digest_a,
761            b: digest_b,
762        },
763        attributes: AttributeComparison {
764            a_only,
765            b_only,
766            a_and_b,
767        },
768        array_elements: ArrayElementComparison {
769            a_count,
770            b_count,
771            a_and_b_count,
772            a_and_b_same_order,
773        },
774    }
775}
776
777/// Convert a `CollectionLevel2` into the comparison array map format.
778/// Only includes the three core attributes (names, lengths, sequences).
779/// Ancillary attributes are not included because `CollectionLevel2` does not carry them.
780pub(crate) fn level2_to_comparison_arrays(level2: &CollectionLevel2) -> HashMap<String, Vec<String>> {
781    let mut map = HashMap::new();
782    map.insert("names".to_string(), level2.names.clone());
783    map.insert(
784        "lengths".to_string(),
785        level2.lengths.iter().map(|l| l.to_string()).collect(),
786    );
787    map.insert("sequences".to_string(), level2.sequences.clone());
788    map
789}
790
791impl Display for SequenceCollection {
792    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
793        write!(
794            f,
795            "SequenceCollection with {} sequences, digest: {}",
796            self.sequences.len(),
797            self.metadata.digest
798        )?;
799        write!(f, "\nFirst 3 sequences:")?;
800        for seqrec in self.sequences.iter().take(3) {
801            write!(f, "\n- {}", seqrec)?;
802        }
803        Ok(())
804    }
805}
806
807// Iterator implementations for SequenceCollection
808// Allows: for seq in &collection { ... }
809impl<'a> IntoIterator for &'a SequenceCollection {
810    type Item = &'a SequenceRecord;
811    type IntoIter = std::slice::Iter<'a, SequenceRecord>;
812
813    fn into_iter(self) -> Self::IntoIter {
814        self.sequences.iter()
815    }
816}
817
818// Consuming iterator
819// Allows: for seq in collection { ... } (consumes the collection)
820impl IntoIterator for SequenceCollection {
821    type Item = SequenceRecord;
822    type IntoIter = std::vec::IntoIter<SequenceRecord>;
823
824    fn into_iter(self) -> Self::IntoIter {
825        self.sequences.into_iter()
826    }
827}
828
/// A collection record that may or may not have its sequence list loaded.
///
/// This is the collection-level counterpart of `SequenceRecord`: a record is
/// either a lightweight `Stub` holding only the collection metadata, or a
/// `Full` record that additionally owns the list of sequence records.
#[derive(Clone, Debug)]
pub enum SequenceCollectionRecord {
    /// Metadata only; the sequence list has not been loaded.
    Stub(SequenceCollectionMetadata),
    /// Metadata together with the loaded sequence list.
    Full {
        /// Collection-level metadata.
        metadata: SequenceCollectionMetadata,
        /// The sequence records belonging to this collection.
        sequences: Vec<SequenceRecord>,
    },
}
841
842impl SequenceCollectionRecord {
843    /// Get metadata regardless of variant
844    pub fn metadata(&self) -> &SequenceCollectionMetadata {
845        match self {
846            SequenceCollectionRecord::Stub(meta) => meta,
847            SequenceCollectionRecord::Full { metadata, .. } => metadata,
848        }
849    }
850
851    /// Get sequences if loaded
852    pub fn sequences(&self) -> Option<&[SequenceRecord]> {
853        match self {
854            SequenceCollectionRecord::Stub(_) => None,
855            SequenceCollectionRecord::Full { sequences, .. } => Some(sequences),
856        }
857    }
858
859    /// Check if sequences are loaded
860    pub fn has_sequences(&self) -> bool {
861        matches!(self, SequenceCollectionRecord::Full { .. })
862    }
863
864    /// Load sequences into a Stub record, converting to Full
865    pub fn with_sequences(self, sequences: Vec<SequenceRecord>) -> Self {
866        let metadata = match self {
867            SequenceCollectionRecord::Stub(m) => m,
868            SequenceCollectionRecord::Full { metadata, .. } => metadata,
869        };
870        SequenceCollectionRecord::Full {
871            metadata,
872            sequences,
873        }
874    }
875
876    /// Convert to a SequenceCollection (requires Full variant or empty collection for Stub)
877    pub fn to_collection(&self) -> SequenceCollection {
878        match self {
879            SequenceCollectionRecord::Stub(meta) => {
880                // Create empty collection with metadata
881                SequenceCollection {
882                    metadata: meta.clone(),
883                    sequences: Vec::new(),
884                }
885            }
886            SequenceCollectionRecord::Full {
887                metadata,
888                sequences,
889            } => SequenceCollection {
890                metadata: metadata.clone(),
891                sequences: sequences.clone(),
892            },
893        }
894    }
895}
896
897impl From<SequenceCollection> for SequenceCollectionRecord {
898    fn from(collection: SequenceCollection) -> Self {
899        SequenceCollectionRecord::Full {
900            metadata: collection.metadata,
901            sequences: collection.sequences,
902        }
903    }
904}
905
906// ============================================================================
907// Pure computation functions
908// ============================================================================
909
910/// Create a SequenceRecord from raw data, computing all metadata.
911///
912/// This is the sequence-level parallel to `digest_fasta()` for collections.
913/// It computes the GA4GH sha512t24u digest, MD5 digest, detects the alphabet,
914/// and packages everything into a SequenceRecord with Full variant.
915///
916/// # Arguments
917/// * `name` - The sequence name (e.g., "chr1")
918/// * `data` - The raw sequence bytes (e.g., b"ACGTACGT")
919///
920/// # Returns
921/// A SequenceRecord::Full with computed metadata and the original data
922///
923/// # Example
924/// ```
925/// use gtars_refget::digest::types::digest_sequence;
926///
927/// let seq = digest_sequence("chr1", b"ACGTACGT");
928/// assert_eq!(seq.metadata().name, "chr1");
929/// assert_eq!(seq.metadata().length, 8);
930/// assert!(!seq.metadata().sha512t24u.is_empty());
931/// ```
932pub fn digest_sequence(name: &str, data: &[u8]) -> SequenceRecord {
933    // Uppercase the data for consistent digest computation (matches FASTA processing)
934    let uppercased: Vec<u8> = data.iter().map(|b| b.to_ascii_uppercase()).collect();
935
936    let metadata = SequenceMetadata {
937        name: name.to_string(),
938        description: None,
939        length: data.len(),
940        sha512t24u: sha512t24u(&uppercased),
941        md5: md5(&uppercased),
942        alphabet: guess_alphabet(&uppercased),
943        fai: None, // No FAI data for programmatically created sequences
944    };
945    SequenceRecord::Full {
946        metadata,
947        sequence: uppercased,
948    }
949}
950
951/// Create a SequenceRecord with a description field.
952///
953/// Same as `digest_sequence()` but allows specifying an optional description.
954///
955/// # Arguments
956/// * `name` - The sequence name (e.g., "chr1")
957/// * `description` - Optional description text
958/// * `data` - The raw sequence bytes (e.g., b"ACGTACGT")
959///
960/// # Returns
961/// A SequenceRecord::Full with computed metadata and the original data
962pub fn digest_sequence_with_description(
963    name: &str,
964    description: Option<&str>,
965    data: &[u8],
966) -> SequenceRecord {
967    let mut seq = digest_sequence(name, data);
968    if let SequenceRecord::Full {
969        ref mut metadata, ..
970    } = seq
971    {
972        metadata.description = description.map(String::from);
973    }
974    seq
975}
976
977/// Parse a single RGSI line into SequenceMetadata.
978///
979/// Supports two formats:
980/// - 5-column (no description): `name\tlength\talphabet\tsha512t24u\tmd5`
981/// - 6-column (with description): `name\tlength\talphabet\tsha512t24u\tmd5\tdescription`
982///
983/// Returns None if the line is a comment, empty, or has wrong column count.
984pub fn parse_rgsi_line(line: &str) -> Option<SequenceMetadata> {
985    // Skip empty lines
986    if line.trim().is_empty() {
987        return None;
988    }
989
990    let parts: Vec<&str> = line.split('\t').collect();
991
992    match parts.len() {
993        // 5-column format: no description
994        5 => Some(SequenceMetadata {
995            name: parts[0].to_string(),
996            description: None,
997            length: parts[1].parse().ok()?,
998            alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
999            sha512t24u: parts[3].to_string(),
1000            md5: parts[4].to_string(),
1001            fai: None,
1002        }),
1003        // 6-column format: description at end
1004        6 => Some(SequenceMetadata {
1005            name: parts[0].to_string(),
1006            description: if parts[5].is_empty() {
1007                None
1008            } else {
1009                Some(parts[5].to_string())
1010            },
1011            length: parts[1].parse().ok()?,
1012            alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
1013            sha512t24u: parts[3].to_string(),
1014            md5: parts[4].to_string(),
1015            fai: None,
1016        }),
1017        _ => None,
1018    }
1019}
1020
1021/// Parse a single line from an RGCI (collection index) file.
1022///
1023/// RGCI format is tab-separated with 5+ columns:
1024/// digest, n_sequences, names_digest, sequences_digest, lengths_digest,
1025/// [name_length_pairs_digest, sorted_name_length_pairs_digest, sorted_sequences_digest]
1026///
1027/// Lines starting with '#' are treated as comments and return None.
1028/// Lines with fewer than 5 columns return None.
1029/// Columns 5-7 are optional ancillary digests (empty string = None).
1030pub fn parse_rgci_line(line: &str) -> Option<SequenceCollectionMetadata> {
1031    if line.starts_with('#') {
1032        return None;
1033    }
1034    let parts: Vec<&str> = line.split('\t').collect();
1035    if parts.len() < 5 {
1036        return None;
1037    }
1038    // Parse optional ancillary digest columns (empty string -> None)
1039    let opt_col = |i: usize| -> Option<String> {
1040        parts.get(i).and_then(|s| {
1041            if s.is_empty() { None } else { Some(s.to_string()) }
1042        })
1043    };
1044    Some(SequenceCollectionMetadata {
1045        digest: parts[0].to_string(),
1046        n_sequences: parts[1].parse().ok()?,
1047        names_digest: parts[2].to_string(),
1048        sequences_digest: parts[3].to_string(),
1049        lengths_digest: parts[4].to_string(),
1050        name_length_pairs_digest: opt_col(5),
1051        sorted_name_length_pairs_digest: opt_col(6),
1052        sorted_sequences_digest: opt_col(7),
1053        file_path: None,
1054    })
1055}
1056
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // Fixtures
    // ------------------------------------------------------------------

    // Two-sequence fixture, deliberately not in name order ("chrX" before "chr1").
    fn test_metadata() -> Vec<SequenceMetadata> {
        let entry = |name: &str, length: usize, sha512t24u: &str, md5: &str| SequenceMetadata {
            name: name.to_string(),
            description: None,
            length,
            sha512t24u: sha512t24u.to_string(),
            md5: md5.to_string(),
            alphabet: AlphabetType::Dna2bit,
            fai: None,
        };
        vec![
            entry("chrX", 8, "abc123", "md5abc"),
            entry("chr1", 4, "def456", "md5def"),
        ]
    }

    // The same fixture with the sequence order flipped.
    fn reversed_metadata() -> Vec<SequenceMetadata> {
        let mut flipped = test_metadata();
        flipped.reverse();
        flipped
    }

    // Borrowed view of a metadata list, as taken by the digest helpers.
    fn as_refs(metadata: &[SequenceMetadata]) -> Vec<&SequenceMetadata> {
        metadata.iter().collect()
    }

    // Wrap every metadata entry in a Stub record.
    fn stub_records(metadata: Vec<SequenceMetadata>) -> Vec<SequenceRecord> {
        metadata.into_iter().map(SequenceRecord::Stub).collect()
    }

    // Collection made of Stub records only.
    fn stub_collection(metadata: Vec<SequenceMetadata>) -> SequenceCollection {
        SequenceCollection::from_records(stub_records(metadata))
    }

    // ------------------------------------------------------------------
    // Ancillary digests
    // ------------------------------------------------------------------

    #[test]
    fn test_ancillary_digest_nlp() {
        let fixture = test_metadata();
        let refs = as_refs(&fixture);
        let nlp = SeqColDigestLvl1::compute_name_length_pairs_digest(&refs);
        assert!(!nlp.is_empty());
        assert_eq!(nlp.len(), 32); // SHA-512/24u is 32 chars base64url
    }

    #[test]
    fn test_ancillary_digest_snlp() {
        let fixture = test_metadata();
        let refs = as_refs(&fixture);
        let snlp = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&refs);
        assert!(!snlp.is_empty());
        assert_eq!(snlp.len(), 32);
    }

    #[test]
    fn test_ancillary_digest_sorted_sequences() {
        let fixture = test_metadata();
        let refs = as_refs(&fixture);
        let ss = SeqColDigestLvl1::compute_sorted_sequences_digest(&refs);
        assert!(!ss.is_empty());
        assert_eq!(ss.len(), 32);
    }

    #[test]
    fn test_nlp_and_snlp_both_valid() {
        let fixture = test_metadata();
        let refs = as_refs(&fixture);
        let nlp = SeqColDigestLvl1::compute_name_length_pairs_digest(&refs);
        let snlp = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&refs);
        // Both must be well-formed 32-char digests. Whether they are equal depends
        // on whether the pair digests happen to be sorted already; the property
        // that matters (SNLP order-invariance) is covered by test_snlp_order_invariant.
        assert_eq!(nlp.len(), 32);
        assert_eq!(snlp.len(), 32);
    }

    #[test]
    fn test_snlp_order_invariant() {
        let forward = test_metadata();
        let backward = reversed_metadata();
        let forward_refs = as_refs(&forward);
        let backward_refs = as_refs(&backward);

        let snlp_forward = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&forward_refs);
        let snlp_backward =
            SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&backward_refs);

        // Sorted NLP must not depend on input order
        assert_eq!(snlp_forward, snlp_backward);
    }

    #[test]
    fn test_sorted_sequences_order_invariant() {
        let forward = test_metadata();
        let backward = reversed_metadata();
        let forward_refs = as_refs(&forward);
        let backward_refs = as_refs(&backward);

        let ss_forward = SeqColDigestLvl1::compute_sorted_sequences_digest(&forward_refs);
        let ss_backward = SeqColDigestLvl1::compute_sorted_sequences_digest(&backward_refs);

        // Sorted sequences must not depend on input order
        assert_eq!(ss_forward, ss_backward);
    }

    #[test]
    fn test_nlp_order_sensitive() {
        let forward = test_metadata();
        let backward = reversed_metadata();
        let forward_refs = as_refs(&forward);
        let backward_refs = as_refs(&backward);

        let nlp_forward = SeqColDigestLvl1::compute_name_length_pairs_digest(&forward_refs);
        let nlp_backward = SeqColDigestLvl1::compute_name_length_pairs_digest(&backward_refs);

        // Unsorted NLP must change when the order changes
        assert_ne!(nlp_forward, nlp_backward);
    }

    // ------------------------------------------------------------------
    // Level 1 / level 2 views
    // ------------------------------------------------------------------

    #[test]
    fn test_to_level1() {
        let records = stub_records(test_metadata());
        let mut coll_meta = SequenceCollectionMetadata::from_sequences(&records, None);
        coll_meta.compute_ancillary_digests(&records);

        let lvl1 = coll_meta.to_level1();
        assert_eq!(lvl1.names, coll_meta.names_digest);
        assert_eq!(lvl1.lengths, coll_meta.lengths_digest);
        assert_eq!(lvl1.sequences, coll_meta.sequences_digest);
        assert!(lvl1.name_length_pairs.is_some());
        assert!(lvl1.sorted_name_length_pairs.is_some());
        assert!(lvl1.sorted_sequences.is_some());
    }

    #[test]
    fn test_to_level2() {
        let collection = stub_collection(test_metadata());

        let lvl2 = collection.to_level2();
        assert_eq!(lvl2.names, vec!["chrX", "chr1"]);
        assert_eq!(lvl2.lengths, vec![8, 4]);
        assert_eq!(lvl2.sequences.len(), 2);
        assert!(lvl2.sequences[0].starts_with("SQ."));
    }

    // ------------------------------------------------------------------
    // Comparison
    // ------------------------------------------------------------------

    #[test]
    fn test_compare_same() {
        let collection = stub_collection(test_metadata());
        let result = collection.compare(&collection);

        assert_eq!(Some(result.digests.a.as_str()), result.digests.b.as_deref());
        assert_eq!(result.attributes.a_and_b.len(), 3); // core only, no ancillary
        for attr in &result.attributes.a_and_b {
            assert_eq!(result.array_elements.a_and_b_same_order[attr], Some(true));
        }
    }

    #[test]
    fn test_compare_reversed_order() {
        let coll_a = stub_collection(test_metadata());
        let coll_b = stub_collection(reversed_metadata());

        let result = coll_a.compare(&coll_b);
        for attr in &result.attributes.a_and_b {
            assert_eq!(result.array_elements.a_and_b_count[attr], 2);
            assert_eq!(result.array_elements.a_and_b_same_order[attr], Some(false));
        }
    }

    #[test]
    fn test_compare_single_element() {
        let only = SequenceMetadata {
            name: "chr1".to_string(),
            description: None,
            length: 4,
            sha512t24u: "abc".to_string(),
            md5: "md5".to_string(),
            alphabet: AlphabetType::Dna2bit,
            fai: None,
        };
        let coll = stub_collection(vec![only]);
        let result = coll.compare(&coll);

        // With a single overlapping element, same_order is expected to be None
        for attr in &result.attributes.a_and_b {
            assert_eq!(result.array_elements.a_and_b_same_order[attr], None);
        }
    }
}
gtars_refget/digest/types.rs

gtars_refget/digest/
types.rs