Skip to main content

gtars_refget/digest/
types.rs

1//! Core types for sequence collections - WASM-safe.
2//!
3//! This module contains the fundamental data structures for representing sequences
4//! and sequence collections. All types here are WASM-compatible and don't require
5//! filesystem access.
6
7use serde::{Deserialize, Serialize};
8use std::fmt::Display;
9use std::path::PathBuf;
10
11use super::algorithms::{canonicalize_json, md5, sha512t24u};
12use super::alphabet::{AlphabetType, guess_alphabet};
13
14/// Metadata for a single sequence, including its name, length, digests, and alphabet type.
15#[derive(Clone, Debug, Serialize, Deserialize)]
16pub struct SequenceMetadata {
17    pub name: String,
18    /// Description from FASTA header (text after first whitespace).
19    #[serde(default)]
20    pub description: Option<String>,
21    pub length: usize,
22    pub sha512t24u: String,
23    pub md5: String,
24    pub alphabet: AlphabetType,
25    pub fai: Option<FaiMetadata>,
26}
27
28impl Default for SequenceMetadata {
29    fn default() -> Self {
30        Self {
31            name: String::new(),
32            description: None,
33            length: 0,
34            sha512t24u: String::new(),
35            md5: String::new(),
36            alphabet: AlphabetType::Ascii,
37            fai: None,
38        }
39    }
40}
41
42/// FASTA index (FAI) metadata for a sequence.
43/// This data is only present when a sequence was loaded from a FASTA file.
44#[derive(Clone, Debug, Serialize, Deserialize)]
45pub struct FaiMetadata {
46    pub offset: u64,     // byte offset to first base of sequence data
47    pub line_bases: u32, // number of bases per line
48    pub line_bytes: u32, // number of bytes per line (including newline chars)
49}
50
51/// A representation of a single sequence that includes metadata and optionally data.
52/// Combines sequence metadata with optional raw/encoded data.
53///
54/// This enum has two variants:
55/// - `Stub`: Contains only metadata, no sequence data loaded
56/// - `Full`: Contains both metadata and the actual sequence data
57#[derive(Clone, Debug)]
58pub enum SequenceRecord {
59    /// A sequence record with only metadata, no sequence data
60    Stub(SequenceMetadata),
61    /// A sequence record with both metadata and sequence data
62    Full {
63        metadata: SequenceMetadata,
64        sequence: Vec<u8>,
65    },
66}
67
68impl SequenceRecord {
69    /// Get metadata regardless of variant
70    pub fn metadata(&self) -> &SequenceMetadata {
71        match self {
72            SequenceRecord::Stub(meta) => meta,
73            SequenceRecord::Full { metadata, .. } => metadata,
74        }
75    }
76
77    /// Get sequence data if present
78    pub fn sequence(&self) -> Option<&[u8]> {
79        match self {
80            SequenceRecord::Stub(_) => None,
81            SequenceRecord::Full { sequence, .. } => Some(sequence),
82        }
83    }
84
85    /// Check if sequence data is loaded (Full) or just metadata (Stub).
86    pub fn is_loaded(&self) -> bool {
87        matches!(self, SequenceRecord::Full { .. })
88    }
89
90    /// Load data into a Stub record, or replace data in a Full record (takes ownership)
91    pub fn with_data(self, sequence: Vec<u8>) -> Self {
92        let metadata = match self {
93            SequenceRecord::Stub(m) => m,
94            SequenceRecord::Full { metadata, .. } => metadata,
95        };
96        SequenceRecord::Full { metadata, sequence }
97    }
98
99    /// Load data into a Stub record in-place, converting it to Full.
100    /// If already Full, replaces the existing sequence data.
101    ///
102    /// This is more efficient than `with_data()` when you have a mutable reference,
103    /// as it avoids cloning the metadata.
104    pub fn load_data(&mut self, sequence: Vec<u8>) {
105        match self {
106            SequenceRecord::Stub(metadata) => {
107                // Take ownership of metadata without cloning
108                let metadata = std::mem::take(metadata);
109                *self = SequenceRecord::Full { metadata, sequence };
110            }
111            SequenceRecord::Full {
112                sequence: existing, ..
113            } => {
114                // Just replace the sequence data
115                *existing = sequence;
116            }
117        }
118    }
119
120    /// Decodes the sequence data to a string.
121    ///
122    /// This method attempts to decode the sequence data stored in this record.
123    /// It handles both raw (uncompressed UTF-8) and encoded (bit-packed) data.
124    /// The decoding strategy depends on the alphabet type:
125    /// - For ASCII alphabet: data is already in raw form, just convert to string
126    /// - For other alphabets: attempt encoded decoding first, fall back to raw
127    ///
128    /// # Returns
129    ///
130    /// * `Some(String)` - The decoded sequence if data is loaded
131    /// * `None` - If no data is loaded in this record
132    pub fn decode(&self) -> Option<String> {
133        use super::alphabet::lookup_alphabet;
134        use super::encoder::decode_substring_from_bytes;
135
136        let (metadata, data) = match self {
137            SequenceRecord::Stub(_) => return None,
138            SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
139        };
140
141        // For ASCII alphabet (8 bits per symbol), the data is always stored raw
142        if metadata.alphabet == AlphabetType::Ascii {
143            return String::from_utf8(data.clone()).ok();
144        }
145
146        // Try to detect if data is raw or encoded
147        // Heuristic: for encoded data, the size should be approximately length * bits_per_symbol / 8
148        // For raw data, the size should be approximately equal to length
149        let alphabet = lookup_alphabet(&metadata.alphabet);
150
151        // If data size matches the expected length (not the encoded size), it's probably raw
152        if data.len() == metadata.length {
153            // Try to decode as UTF-8
154            if let Ok(raw_string) = String::from_utf8(data.clone()) {
155                // Data appears to be raw UTF-8
156                return Some(raw_string);
157            }
158        }
159
160        // Data is probably encoded (size matches expected encoded size), try to decode it
161        let decoded_bytes = decode_substring_from_bytes(data, 0, metadata.length, alphabet);
162
163        // Convert to string
164        String::from_utf8(decoded_bytes).ok()
165    }
166}
167
168impl Display for SequenceRecord {
169    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
170        write!(
171            f,
172            "SequenceRecord: {} (length: {}, alphabet: {}, ga4gh: {:02x?}, md5: {:02x?})",
173            &self.metadata().name,
174            &self.metadata().length,
175            &self.metadata().alphabet,
176            &self.metadata().sha512t24u,
177            &self.metadata().md5
178        )?;
179        Ok(())
180    }
181}
182
183/// A struct representing the first level of digests for a refget sequence collection.
184#[derive(Debug, Serialize, Deserialize, Clone)]
185pub struct SeqColDigestLvl1 {
186    pub sequences_digest: String,
187    pub names_digest: String,
188    pub lengths_digest: String,
189}
190
191impl SeqColDigestLvl1 {
192    /// Compute collection digest from lvl1 digests
193    pub fn to_digest(&self) -> String {
194        // Create JSON object with the lvl1 digest strings
195        let mut lvl1_object = serde_json::Map::new();
196        lvl1_object.insert(
197            "names".to_string(),
198            serde_json::Value::String(self.names_digest.clone()),
199        );
200        lvl1_object.insert(
201            "sequences".to_string(),
202            serde_json::Value::String(self.sequences_digest.clone()),
203        );
204
205        let lvl1_json = serde_json::Value::Object(lvl1_object);
206
207        // Canonicalize the JSON object and compute collection digest
208        let lvl1_canonical = canonicalize_json(&lvl1_json);
209        sha512t24u(lvl1_canonical.as_bytes())
210    }
211
212    /// Compute lvl1 digests from a collection of SequenceMetadata
213    pub fn from_metadata(metadata_vec: &[&SequenceMetadata]) -> Self {
214        use serde_json::Value;
215
216        // Extract arrays for each field
217        let sequences: Vec<String> = metadata_vec
218            .iter()
219            .map(|md| format!("SQ.{}", md.sha512t24u))
220            .collect();
221        let names: Vec<&str> = metadata_vec.iter().map(|md| md.name.as_str()).collect();
222        let lengths: Vec<usize> = metadata_vec.iter().map(|md| md.length).collect();
223
224        // Convert to JSON Values and canonicalize
225        let sequences_json = Value::Array(
226            sequences
227                .iter()
228                .map(|s| Value::String(s.to_string()))
229                .collect(),
230        );
231        let names_json = Value::Array(names.iter().map(|s| Value::String(s.to_string())).collect());
232        let lengths_json = Value::Array(
233            lengths
234                .iter()
235                .map(|l| Value::Number(serde_json::Number::from(*l)))
236                .collect(),
237        );
238
239        // Canonicalize to JCS format
240        let sequences_canonical = canonicalize_json(&sequences_json);
241        let names_canonical = canonicalize_json(&names_json);
242        let lengths_canonical = canonicalize_json(&lengths_json);
243
244        // Hash the canonicalized arrays
245        SeqColDigestLvl1 {
246            sequences_digest: sha512t24u(sequences_canonical.as_bytes()),
247            names_digest: sha512t24u(names_canonical.as_bytes()),
248            lengths_digest: sha512t24u(lengths_canonical.as_bytes()),
249        }
250    }
251}
252
253/// Metadata for a sequence collection (parallel to SequenceMetadata).
254/// Contains the collection digest and level 1 digests for names, sequences, and lengths.
255#[derive(Clone, Debug, Serialize, Deserialize)]
256pub struct SequenceCollectionMetadata {
257    /// Top-level seqcol digest
258    pub digest: String,
259    /// Number of sequences in the collection
260    pub n_sequences: usize,
261    /// Level 1 digest of names array
262    pub names_digest: String,
263    /// Level 1 digest of sequences array
264    pub sequences_digest: String,
265    /// Level 1 digest of lengths array
266    pub lengths_digest: String,
267    /// Optional path to the source file
268    pub file_path: Option<PathBuf>,
269}
270
271impl SequenceCollectionMetadata {
272    /// Compute metadata from sequence records
273    pub fn from_sequences(sequences: &[SequenceRecord], file_path: Option<PathBuf>) -> Self {
274        // Extract metadata refs
275        let metadata_refs: Vec<&SequenceMetadata> =
276            sequences.iter().map(|r| r.metadata()).collect();
277
278        // Compute level 1 digests
279        let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs);
280
281        // Compute top-level digest from level 1 digests
282        let digest = lvl1.to_digest();
283
284        Self {
285            digest,
286            n_sequences: sequences.len(),
287            names_digest: lvl1.names_digest,
288            sequences_digest: lvl1.sequences_digest,
289            lengths_digest: lvl1.lengths_digest,
290            file_path,
291        }
292    }
293
294    /// Create from an existing SequenceCollection
295    pub fn from_collection(collection: &SequenceCollection) -> Self {
296        collection.metadata.clone()
297    }
298
299    /// Convert to SeqColDigestLvl1 for compatibility
300    pub fn to_lvl1(&self) -> SeqColDigestLvl1 {
301        SeqColDigestLvl1 {
302            sequences_digest: self.sequences_digest.clone(),
303            names_digest: self.names_digest.clone(),
304            lengths_digest: self.lengths_digest.clone(),
305        }
306    }
307}
308
309/// A single Sequence Collection, which may or may not hold data.
310#[derive(Clone, Debug)]
311pub struct SequenceCollection {
312    /// Collection metadata (digest, level 1 digests, n_sequences, file_path)
313    pub metadata: SequenceCollectionMetadata,
314
315    /// Vector of SequenceRecords, which contain metadata (name, length, digests, alphabet)
316    /// and optionally the actual sequence data.
317    pub sequences: Vec<SequenceRecord>,
318}
319
320impl SequenceCollection {
321    /// Create a SequenceCollection from a vector of SequenceRecords.
322    pub fn from_records(records: Vec<SequenceRecord>) -> Self {
323        // Compute metadata from the sequence records
324        let metadata = SequenceCollectionMetadata::from_sequences(&records, None);
325
326        SequenceCollection {
327            metadata,
328            sequences: records,
329        }
330    }
331}
332
333impl Display for SequenceCollection {
334    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
335        write!(
336            f,
337            "SequenceCollection with {} sequences, digest: {}",
338            self.sequences.len(),
339            self.metadata.digest
340        )?;
341        write!(f, "\nFirst 3 sequences:")?;
342        for seqrec in self.sequences.iter().take(3) {
343            write!(f, "\n- {}", seqrec)?;
344        }
345        Ok(())
346    }
347}
348
349// Iterator implementations for SequenceCollection
350// Allows: for seq in &collection { ... }
351impl<'a> IntoIterator for &'a SequenceCollection {
352    type Item = &'a SequenceRecord;
353    type IntoIter = std::slice::Iter<'a, SequenceRecord>;
354
355    fn into_iter(self) -> Self::IntoIter {
356        self.sequences.iter()
357    }
358}
359
360// Consuming iterator
361// Allows: for seq in collection { ... } (consumes the collection)
362impl IntoIterator for SequenceCollection {
363    type Item = SequenceRecord;
364    type IntoIter = std::vec::IntoIter<SequenceRecord>;
365
366    fn into_iter(self) -> Self::IntoIter {
367        self.sequences.into_iter()
368    }
369}
370
371/// A collection record that may or may not have its sequence list loaded.
372/// Parallel to SequenceRecord.
373#[derive(Clone, Debug)]
374pub enum SequenceCollectionRecord {
375    /// Collection with only metadata, sequence list not loaded
376    Stub(SequenceCollectionMetadata),
377    /// Collection with metadata and the actual sequence list
378    Full {
379        metadata: SequenceCollectionMetadata,
380        sequences: Vec<SequenceRecord>,
381    },
382}
383
384impl SequenceCollectionRecord {
385    /// Get metadata regardless of variant
386    pub fn metadata(&self) -> &SequenceCollectionMetadata {
387        match self {
388            SequenceCollectionRecord::Stub(meta) => meta,
389            SequenceCollectionRecord::Full { metadata, .. } => metadata,
390        }
391    }
392
393    /// Get sequences if loaded
394    pub fn sequences(&self) -> Option<&[SequenceRecord]> {
395        match self {
396            SequenceCollectionRecord::Stub(_) => None,
397            SequenceCollectionRecord::Full { sequences, .. } => Some(sequences),
398        }
399    }
400
401    /// Check if sequences are loaded
402    pub fn has_sequences(&self) -> bool {
403        matches!(self, SequenceCollectionRecord::Full { .. })
404    }
405
406    /// Load sequences into a Stub record, converting to Full
407    pub fn with_sequences(self, sequences: Vec<SequenceRecord>) -> Self {
408        let metadata = match self {
409            SequenceCollectionRecord::Stub(m) => m,
410            SequenceCollectionRecord::Full { metadata, .. } => metadata,
411        };
412        SequenceCollectionRecord::Full {
413            metadata,
414            sequences,
415        }
416    }
417
418    /// Convert to a SequenceCollection (requires Full variant or empty collection for Stub)
419    pub fn to_collection(&self) -> SequenceCollection {
420        match self {
421            SequenceCollectionRecord::Stub(meta) => {
422                // Create empty collection with metadata
423                SequenceCollection {
424                    metadata: meta.clone(),
425                    sequences: Vec::new(),
426                }
427            }
428            SequenceCollectionRecord::Full {
429                metadata,
430                sequences,
431            } => SequenceCollection {
432                metadata: metadata.clone(),
433                sequences: sequences.clone(),
434            },
435        }
436    }
437}
438
439impl From<SequenceCollection> for SequenceCollectionRecord {
440    fn from(collection: SequenceCollection) -> Self {
441        SequenceCollectionRecord::Full {
442            metadata: collection.metadata,
443            sequences: collection.sequences,
444        }
445    }
446}
447
448// ============================================================================
449// Pure computation functions
450// ============================================================================
451
452/// Create a SequenceRecord from raw data, computing all metadata.
453///
454/// This is the sequence-level parallel to `digest_fasta()` for collections.
455/// It computes the GA4GH sha512t24u digest, MD5 digest, detects the alphabet,
456/// and packages everything into a SequenceRecord with Full variant.
457///
458/// # Arguments
459/// * `name` - The sequence name (e.g., "chr1")
460/// * `data` - The raw sequence bytes (e.g., b"ACGTACGT")
461///
462/// # Returns
463/// A SequenceRecord::Full with computed metadata and the original data
464///
465/// # Example
466/// ```
467/// use gtars_refget::digest::types::digest_sequence;
468///
469/// let seq = digest_sequence("chr1", b"ACGTACGT");
470/// assert_eq!(seq.metadata().name, "chr1");
471/// assert_eq!(seq.metadata().length, 8);
472/// assert!(!seq.metadata().sha512t24u.is_empty());
473/// ```
474pub fn digest_sequence(name: &str, data: &[u8]) -> SequenceRecord {
475    // Uppercase the data for consistent digest computation (matches FASTA processing)
476    let uppercased: Vec<u8> = data.iter().map(|b| b.to_ascii_uppercase()).collect();
477
478    let metadata = SequenceMetadata {
479        name: name.to_string(),
480        description: None,
481        length: data.len(),
482        sha512t24u: sha512t24u(&uppercased),
483        md5: md5(&uppercased),
484        alphabet: guess_alphabet(&uppercased),
485        fai: None, // No FAI data for programmatically created sequences
486    };
487    SequenceRecord::Full {
488        metadata,
489        sequence: uppercased,
490    }
491}
492
493/// Create a SequenceRecord with a description field.
494///
495/// Same as `digest_sequence()` but allows specifying an optional description.
496///
497/// # Arguments
498/// * `name` - The sequence name (e.g., "chr1")
499/// * `description` - Optional description text
500/// * `data` - The raw sequence bytes (e.g., b"ACGTACGT")
501///
502/// # Returns
503/// A SequenceRecord::Full with computed metadata and the original data
504pub fn digest_sequence_with_description(
505    name: &str,
506    description: Option<&str>,
507    data: &[u8],
508) -> SequenceRecord {
509    let mut seq = digest_sequence(name, data);
510    if let SequenceRecord::Full {
511        ref mut metadata, ..
512    } = seq
513    {
514        metadata.description = description.map(String::from);
515    }
516    seq
517}
518
519/// Parse a single RGSI line into SequenceMetadata.
520///
521/// Supports two formats:
522/// - 5-column (no description): `name\tlength\talphabet\tsha512t24u\tmd5`
523/// - 6-column (with description): `name\tlength\talphabet\tsha512t24u\tmd5\tdescription`
524///
525/// Returns None if the line is a comment, empty, or has wrong column count.
526pub fn parse_rgsi_line(line: &str) -> Option<SequenceMetadata> {
527    // Skip empty lines
528    if line.trim().is_empty() {
529        return None;
530    }
531
532    let parts: Vec<&str> = line.split('\t').collect();
533
534    match parts.len() {
535        // 5-column format: no description
536        5 => Some(SequenceMetadata {
537            name: parts[0].to_string(),
538            description: None,
539            length: parts[1].parse().ok()?,
540            alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
541            sha512t24u: parts[3].to_string(),
542            md5: parts[4].to_string(),
543            fai: None,
544        }),
545        // 6-column format: description at end
546        6 => Some(SequenceMetadata {
547            name: parts[0].to_string(),
548            description: if parts[5].is_empty() {
549                None
550            } else {
551                Some(parts[5].to_string())
552            },
553            length: parts[1].parse().ok()?,
554            alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
555            sha512t24u: parts[3].to_string(),
556            md5: parts[4].to_string(),
557            fai: None,
558        }),
559        _ => None,
560    }
561}