1use serde::{Deserialize, Serialize};
8use std::fmt::Display;
9use std::path::PathBuf;
10
11use super::algorithms::{canonicalize_json, md5, sha512t24u};
12use super::alphabet::{AlphabetType, guess_alphabet};
13
14#[derive(Clone, Debug, Serialize, Deserialize)]
16pub struct SequenceMetadata {
17 pub name: String,
18 #[serde(default)]
20 pub description: Option<String>,
21 pub length: usize,
22 pub sha512t24u: String,
23 pub md5: String,
24 pub alphabet: AlphabetType,
25 pub fai: Option<FaiMetadata>,
26}
27
28impl Default for SequenceMetadata {
29 fn default() -> Self {
30 Self {
31 name: String::new(),
32 description: None,
33 length: 0,
34 sha512t24u: String::new(),
35 md5: String::new(),
36 alphabet: AlphabetType::Ascii,
37 fai: None,
38 }
39 }
40}
41
42#[derive(Clone, Debug, Serialize, Deserialize)]
45pub struct FaiMetadata {
46 pub offset: u64, pub line_bases: u32, pub line_bytes: u32, }
50
51#[derive(Clone, Debug)]
58pub enum SequenceRecord {
59 Stub(SequenceMetadata),
61 Full {
63 metadata: SequenceMetadata,
64 sequence: Vec<u8>,
65 },
66}
67
68impl SequenceRecord {
69 pub fn metadata(&self) -> &SequenceMetadata {
71 match self {
72 SequenceRecord::Stub(meta) => meta,
73 SequenceRecord::Full { metadata, .. } => metadata,
74 }
75 }
76
77 pub fn sequence(&self) -> Option<&[u8]> {
79 match self {
80 SequenceRecord::Stub(_) => None,
81 SequenceRecord::Full { sequence, .. } => Some(sequence),
82 }
83 }
84
85 pub fn is_loaded(&self) -> bool {
87 matches!(self, SequenceRecord::Full { .. })
88 }
89
90 pub fn with_data(self, sequence: Vec<u8>) -> Self {
92 let metadata = match self {
93 SequenceRecord::Stub(m) => m,
94 SequenceRecord::Full { metadata, .. } => metadata,
95 };
96 SequenceRecord::Full { metadata, sequence }
97 }
98
99 pub fn load_data(&mut self, sequence: Vec<u8>) {
105 match self {
106 SequenceRecord::Stub(metadata) => {
107 let metadata = std::mem::take(metadata);
109 *self = SequenceRecord::Full { metadata, sequence };
110 }
111 SequenceRecord::Full {
112 sequence: existing, ..
113 } => {
114 *existing = sequence;
116 }
117 }
118 }
119
120 pub fn decode(&self) -> Option<String> {
133 use super::alphabet::lookup_alphabet;
134 use super::encoder::decode_substring_from_bytes;
135
136 let (metadata, data) = match self {
137 SequenceRecord::Stub(_) => return None,
138 SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
139 };
140
141 if metadata.alphabet == AlphabetType::Ascii {
143 return String::from_utf8(data.clone()).ok();
144 }
145
146 let alphabet = lookup_alphabet(&metadata.alphabet);
150
151 if data.len() == metadata.length {
153 if let Ok(raw_string) = String::from_utf8(data.clone()) {
155 return Some(raw_string);
157 }
158 }
159
160 let decoded_bytes = decode_substring_from_bytes(data, 0, metadata.length, alphabet);
162
163 String::from_utf8(decoded_bytes).ok()
165 }
166}
167
168impl Display for SequenceRecord {
169 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
170 write!(
171 f,
172 "SequenceRecord: {} (length: {}, alphabet: {}, ga4gh: {:02x?}, md5: {:02x?})",
173 &self.metadata().name,
174 &self.metadata().length,
175 &self.metadata().alphabet,
176 &self.metadata().sha512t24u,
177 &self.metadata().md5
178 )?;
179 Ok(())
180 }
181}
182
183#[derive(Debug, Serialize, Deserialize, Clone)]
185pub struct SeqColDigestLvl1 {
186 pub sequences_digest: String,
187 pub names_digest: String,
188 pub lengths_digest: String,
189}
190
191impl SeqColDigestLvl1 {
192 pub fn to_digest(&self) -> String {
194 let mut lvl1_object = serde_json::Map::new();
196 lvl1_object.insert(
197 "names".to_string(),
198 serde_json::Value::String(self.names_digest.clone()),
199 );
200 lvl1_object.insert(
201 "sequences".to_string(),
202 serde_json::Value::String(self.sequences_digest.clone()),
203 );
204
205 let lvl1_json = serde_json::Value::Object(lvl1_object);
206
207 let lvl1_canonical = canonicalize_json(&lvl1_json);
209 sha512t24u(lvl1_canonical.as_bytes())
210 }
211
212 pub fn from_metadata(metadata_vec: &[&SequenceMetadata]) -> Self {
214 use serde_json::Value;
215
216 let sequences: Vec<String> = metadata_vec
218 .iter()
219 .map(|md| format!("SQ.{}", md.sha512t24u))
220 .collect();
221 let names: Vec<&str> = metadata_vec.iter().map(|md| md.name.as_str()).collect();
222 let lengths: Vec<usize> = metadata_vec.iter().map(|md| md.length).collect();
223
224 let sequences_json = Value::Array(
226 sequences
227 .iter()
228 .map(|s| Value::String(s.to_string()))
229 .collect(),
230 );
231 let names_json = Value::Array(names.iter().map(|s| Value::String(s.to_string())).collect());
232 let lengths_json = Value::Array(
233 lengths
234 .iter()
235 .map(|l| Value::Number(serde_json::Number::from(*l)))
236 .collect(),
237 );
238
239 let sequences_canonical = canonicalize_json(&sequences_json);
241 let names_canonical = canonicalize_json(&names_json);
242 let lengths_canonical = canonicalize_json(&lengths_json);
243
244 SeqColDigestLvl1 {
246 sequences_digest: sha512t24u(sequences_canonical.as_bytes()),
247 names_digest: sha512t24u(names_canonical.as_bytes()),
248 lengths_digest: sha512t24u(lengths_canonical.as_bytes()),
249 }
250 }
251}
252
253#[derive(Clone, Debug, Serialize, Deserialize)]
256pub struct SequenceCollectionMetadata {
257 pub digest: String,
259 pub n_sequences: usize,
261 pub names_digest: String,
263 pub sequences_digest: String,
265 pub lengths_digest: String,
267 pub file_path: Option<PathBuf>,
269}
270
271impl SequenceCollectionMetadata {
272 pub fn from_sequences(sequences: &[SequenceRecord], file_path: Option<PathBuf>) -> Self {
274 let metadata_refs: Vec<&SequenceMetadata> =
276 sequences.iter().map(|r| r.metadata()).collect();
277
278 let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs);
280
281 let digest = lvl1.to_digest();
283
284 Self {
285 digest,
286 n_sequences: sequences.len(),
287 names_digest: lvl1.names_digest,
288 sequences_digest: lvl1.sequences_digest,
289 lengths_digest: lvl1.lengths_digest,
290 file_path,
291 }
292 }
293
294 pub fn from_collection(collection: &SequenceCollection) -> Self {
296 collection.metadata.clone()
297 }
298
299 pub fn to_lvl1(&self) -> SeqColDigestLvl1 {
301 SeqColDigestLvl1 {
302 sequences_digest: self.sequences_digest.clone(),
303 names_digest: self.names_digest.clone(),
304 lengths_digest: self.lengths_digest.clone(),
305 }
306 }
307}
308
309#[derive(Clone, Debug)]
311pub struct SequenceCollection {
312 pub metadata: SequenceCollectionMetadata,
314
315 pub sequences: Vec<SequenceRecord>,
318}
319
320impl SequenceCollection {
321 pub fn from_records(records: Vec<SequenceRecord>) -> Self {
323 let metadata = SequenceCollectionMetadata::from_sequences(&records, None);
325
326 SequenceCollection {
327 metadata,
328 sequences: records,
329 }
330 }
331}
332
333impl Display for SequenceCollection {
334 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
335 write!(
336 f,
337 "SequenceCollection with {} sequences, digest: {}",
338 self.sequences.len(),
339 self.metadata.digest
340 )?;
341 write!(f, "\nFirst 3 sequences:")?;
342 for seqrec in self.sequences.iter().take(3) {
343 write!(f, "\n- {}", seqrec)?;
344 }
345 Ok(())
346 }
347}
348
349impl<'a> IntoIterator for &'a SequenceCollection {
352 type Item = &'a SequenceRecord;
353 type IntoIter = std::slice::Iter<'a, SequenceRecord>;
354
355 fn into_iter(self) -> Self::IntoIter {
356 self.sequences.iter()
357 }
358}
359
360impl IntoIterator for SequenceCollection {
363 type Item = SequenceRecord;
364 type IntoIter = std::vec::IntoIter<SequenceRecord>;
365
366 fn into_iter(self) -> Self::IntoIter {
367 self.sequences.into_iter()
368 }
369}
370
371#[derive(Clone, Debug)]
374pub enum SequenceCollectionRecord {
375 Stub(SequenceCollectionMetadata),
377 Full {
379 metadata: SequenceCollectionMetadata,
380 sequences: Vec<SequenceRecord>,
381 },
382}
383
384impl SequenceCollectionRecord {
385 pub fn metadata(&self) -> &SequenceCollectionMetadata {
387 match self {
388 SequenceCollectionRecord::Stub(meta) => meta,
389 SequenceCollectionRecord::Full { metadata, .. } => metadata,
390 }
391 }
392
393 pub fn sequences(&self) -> Option<&[SequenceRecord]> {
395 match self {
396 SequenceCollectionRecord::Stub(_) => None,
397 SequenceCollectionRecord::Full { sequences, .. } => Some(sequences),
398 }
399 }
400
401 pub fn has_sequences(&self) -> bool {
403 matches!(self, SequenceCollectionRecord::Full { .. })
404 }
405
406 pub fn with_sequences(self, sequences: Vec<SequenceRecord>) -> Self {
408 let metadata = match self {
409 SequenceCollectionRecord::Stub(m) => m,
410 SequenceCollectionRecord::Full { metadata, .. } => metadata,
411 };
412 SequenceCollectionRecord::Full {
413 metadata,
414 sequences,
415 }
416 }
417
418 pub fn to_collection(&self) -> SequenceCollection {
420 match self {
421 SequenceCollectionRecord::Stub(meta) => {
422 SequenceCollection {
424 metadata: meta.clone(),
425 sequences: Vec::new(),
426 }
427 }
428 SequenceCollectionRecord::Full {
429 metadata,
430 sequences,
431 } => SequenceCollection {
432 metadata: metadata.clone(),
433 sequences: sequences.clone(),
434 },
435 }
436 }
437}
438
439impl From<SequenceCollection> for SequenceCollectionRecord {
440 fn from(collection: SequenceCollection) -> Self {
441 SequenceCollectionRecord::Full {
442 metadata: collection.metadata,
443 sequences: collection.sequences,
444 }
445 }
446}
447
448pub fn digest_sequence(name: &str, data: &[u8]) -> SequenceRecord {
475 let uppercased: Vec<u8> = data.iter().map(|b| b.to_ascii_uppercase()).collect();
477
478 let metadata = SequenceMetadata {
479 name: name.to_string(),
480 description: None,
481 length: data.len(),
482 sha512t24u: sha512t24u(&uppercased),
483 md5: md5(&uppercased),
484 alphabet: guess_alphabet(&uppercased),
485 fai: None, };
487 SequenceRecord::Full {
488 metadata,
489 sequence: uppercased,
490 }
491}
492
493pub fn digest_sequence_with_description(
505 name: &str,
506 description: Option<&str>,
507 data: &[u8],
508) -> SequenceRecord {
509 let mut seq = digest_sequence(name, data);
510 if let SequenceRecord::Full {
511 ref mut metadata, ..
512 } = seq
513 {
514 metadata.description = description.map(String::from);
515 }
516 seq
517}
518
519pub fn parse_rgsi_line(line: &str) -> Option<SequenceMetadata> {
527 if line.trim().is_empty() {
529 return None;
530 }
531
532 let parts: Vec<&str> = line.split('\t').collect();
533
534 match parts.len() {
535 5 => Some(SequenceMetadata {
537 name: parts[0].to_string(),
538 description: None,
539 length: parts[1].parse().ok()?,
540 alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
541 sha512t24u: parts[3].to_string(),
542 md5: parts[4].to_string(),
543 fai: None,
544 }),
545 6 => Some(SequenceMetadata {
547 name: parts[0].to_string(),
548 description: if parts[5].is_empty() {
549 None
550 } else {
551 Some(parts[5].to_string())
552 },
553 length: parts[1].parse().ok()?,
554 alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
555 sha512t24u: parts[3].to_string(),
556 md5: parts[4].to_string(),
557 fai: None,
558 }),
559 _ => None,
560 }
561}