1use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::fmt::Display;
10use std::path::PathBuf;
11
12use super::algorithms::{canonicalize_json, md5, sha512t24u};
13use super::alphabet::{AlphabetType, guess_alphabet};
14
15#[derive(Clone, Debug, Serialize, Deserialize)]
17pub struct SequenceMetadata {
18 pub name: String,
19 #[serde(default)]
21 pub description: Option<String>,
22 pub length: usize,
23 pub sha512t24u: String,
24 pub md5: String,
25 pub alphabet: AlphabetType,
26 pub fai: Option<FaiMetadata>,
27}
28
29impl Default for SequenceMetadata {
30 fn default() -> Self {
31 Self {
32 name: String::new(),
33 description: None,
34 length: 0,
35 sha512t24u: String::new(),
36 md5: String::new(),
37 alphabet: AlphabetType::Ascii,
38 fai: None,
39 }
40 }
41}
42
43#[derive(Clone, Debug, Serialize, Deserialize)]
46pub struct FaiMetadata {
47 pub offset: u64, pub line_bases: u32, pub line_bytes: u32, }
51
52#[derive(Clone, Debug)]
59pub enum SequenceRecord {
60 Stub(SequenceMetadata),
62 Full {
64 metadata: SequenceMetadata,
65 sequence: Vec<u8>,
66 },
67}
68
69impl SequenceRecord {
70 pub fn metadata(&self) -> &SequenceMetadata {
72 match self {
73 SequenceRecord::Stub(meta) => meta,
74 SequenceRecord::Full { metadata, .. } => metadata,
75 }
76 }
77
78 pub fn sequence(&self) -> Option<&[u8]> {
80 match self {
81 SequenceRecord::Stub(_) => None,
82 SequenceRecord::Full { sequence, .. } => Some(sequence),
83 }
84 }
85
86 pub fn is_loaded(&self) -> bool {
88 matches!(self, SequenceRecord::Full { .. })
89 }
90
91 pub fn with_data(self, sequence: Vec<u8>) -> Self {
93 let metadata = match self {
94 SequenceRecord::Stub(m) => m,
95 SequenceRecord::Full { metadata, .. } => metadata,
96 };
97 SequenceRecord::Full { metadata, sequence }
98 }
99
100 pub fn load_data(&mut self, sequence: Vec<u8>) {
106 match self {
107 SequenceRecord::Stub(metadata) => {
108 let metadata = std::mem::take(metadata);
110 *self = SequenceRecord::Full { metadata, sequence };
111 }
112 SequenceRecord::Full {
113 sequence: existing, ..
114 } => {
115 *existing = sequence;
117 }
118 }
119 }
120
121 pub fn decode(&self) -> Option<String> {
134 use super::alphabet::lookup_alphabet;
135 use super::encoder::decode_substring_from_bytes;
136
137 let (metadata, data) = match self {
138 SequenceRecord::Stub(_) => return None,
139 SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
140 };
141
142 if metadata.alphabet == AlphabetType::Ascii {
144 return String::from_utf8(data.clone()).ok();
145 }
146
147 let alphabet = lookup_alphabet(&metadata.alphabet);
151
152 if data.len() == metadata.length {
154 if let Ok(raw_string) = String::from_utf8(data.clone()) {
156 return Some(raw_string);
158 }
159 }
160
161 let decoded_bytes = decode_substring_from_bytes(data, 0, metadata.length, alphabet);
163
164 String::from_utf8(decoded_bytes).ok()
166 }
167}
168
169impl Display for SequenceRecord {
170 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
171 write!(
172 f,
173 "SequenceRecord: {} (length: {}, alphabet: {}, ga4gh: {:02x?}, md5: {:02x?})",
174 &self.metadata().name,
175 &self.metadata().length,
176 &self.metadata().alphabet,
177 &self.metadata().sha512t24u,
178 &self.metadata().md5
179 )?;
180 Ok(())
181 }
182}
183
184#[derive(Debug, Serialize, Deserialize, Clone)]
186pub struct SeqColDigestLvl1 {
187 pub sequences_digest: String,
188 pub names_digest: String,
189 pub lengths_digest: String,
190}
191
192impl SeqColDigestLvl1 {
193 pub fn to_digest(&self) -> String {
195 let mut lvl1_object = serde_json::Map::new();
197 lvl1_object.insert(
198 "names".to_string(),
199 serde_json::Value::String(self.names_digest.clone()),
200 );
201 lvl1_object.insert(
202 "sequences".to_string(),
203 serde_json::Value::String(self.sequences_digest.clone()),
204 );
205
206 let lvl1_json = serde_json::Value::Object(lvl1_object);
207
208 let lvl1_canonical = canonicalize_json(&lvl1_json);
210 sha512t24u(lvl1_canonical.as_bytes())
211 }
212
213 pub fn from_metadata(metadata_vec: &[&SequenceMetadata]) -> Self {
215 use serde_json::Value;
216
217 let sequences: Vec<String> = metadata_vec
219 .iter()
220 .map(|md| format!("SQ.{}", md.sha512t24u))
221 .collect();
222 let names: Vec<&str> = metadata_vec.iter().map(|md| md.name.as_str()).collect();
223 let lengths: Vec<usize> = metadata_vec.iter().map(|md| md.length).collect();
224
225 let sequences_json = Value::Array(
227 sequences
228 .iter()
229 .map(|s| Value::String(s.to_string()))
230 .collect(),
231 );
232 let names_json = Value::Array(names.iter().map(|s| Value::String(s.to_string())).collect());
233 let lengths_json = Value::Array(
234 lengths
235 .iter()
236 .map(|l| Value::Number(serde_json::Number::from(*l)))
237 .collect(),
238 );
239
240 let sequences_canonical = canonicalize_json(&sequences_json);
242 let names_canonical = canonicalize_json(&names_json);
243 let lengths_canonical = canonicalize_json(&lengths_json);
244
245 SeqColDigestLvl1 {
247 sequences_digest: sha512t24u(sequences_canonical.as_bytes()),
248 names_digest: sha512t24u(names_canonical.as_bytes()),
249 lengths_digest: sha512t24u(lengths_canonical.as_bytes()),
250 }
251 }
252
253 pub fn compute_name_length_pairs_digest(metadata: &[&SequenceMetadata]) -> String {
259 use serde_json::Value;
260
261 let pairs: Vec<Value> = metadata
263 .iter()
264 .map(|md| {
265 let mut obj = serde_json::Map::new();
266 obj.insert(
267 "length".to_string(),
268 Value::Number(serde_json::Number::from(md.length)),
269 );
270 obj.insert("name".to_string(), Value::String(md.name.clone()));
271 Value::Object(obj)
272 })
273 .collect();
274
275 let canonical = canonicalize_json(&Value::Array(pairs));
277 sha512t24u(canonical.as_bytes())
278 }
279
280 pub fn compute_sorted_name_length_pairs_digest(metadata: &[&SequenceMetadata]) -> String {
285 use serde_json::Value;
286
287 let mut pair_digests: Vec<String> = metadata
288 .iter()
289 .map(|md| {
290 let mut obj = serde_json::Map::new();
291 obj.insert(
292 "length".to_string(),
293 Value::Number(serde_json::Number::from(md.length)),
294 );
295 obj.insert("name".to_string(), Value::String(md.name.clone()));
296 let canonical = canonicalize_json(&Value::Object(obj));
297 sha512t24u(canonical.as_bytes())
298 })
299 .collect();
300
301 pair_digests.sort();
302
303 let array_json = Value::Array(
304 pair_digests
305 .iter()
306 .map(|d| Value::String(d.clone()))
307 .collect(),
308 );
309 let canonical = canonicalize_json(&array_json);
310 sha512t24u(canonical.as_bytes())
311 }
312
313 pub fn compute_sorted_sequences_digest(metadata: &[&SequenceMetadata]) -> String {
318 use serde_json::Value;
319
320 let mut sequences: Vec<String> = metadata
321 .iter()
322 .map(|md| format!("SQ.{}", md.sha512t24u))
323 .collect();
324
325 sequences.sort();
326
327 let array_json = Value::Array(
328 sequences
329 .iter()
330 .map(|s| Value::String(s.clone()))
331 .collect(),
332 );
333 let canonical = canonicalize_json(&array_json);
334 sha512t24u(canonical.as_bytes())
335 }
336}
337
338#[derive(Clone, Debug, Serialize, Deserialize)]
341pub struct SequenceCollectionMetadata {
342 pub digest: String,
344 pub n_sequences: usize,
346 pub names_digest: String,
348 pub sequences_digest: String,
350 pub lengths_digest: String,
352 #[serde(default, skip_serializing_if = "Option::is_none")]
354 pub name_length_pairs_digest: Option<String>,
355 #[serde(default, skip_serializing_if = "Option::is_none")]
357 pub sorted_name_length_pairs_digest: Option<String>,
358 #[serde(default, skip_serializing_if = "Option::is_none")]
360 pub sorted_sequences_digest: Option<String>,
361 pub file_path: Option<PathBuf>,
363}
364
365impl SequenceCollectionMetadata {
366 pub fn from_sequences(
368 sequences: &[SequenceRecord],
369 file_path: Option<PathBuf>,
370 ) -> Self {
371 let metadata_refs: Vec<&SequenceMetadata> =
373 sequences.iter().map(|r| r.metadata()).collect();
374
375 let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs);
377
378 let digest = lvl1.to_digest();
380
381 Self {
382 digest,
383 n_sequences: sequences.len(),
384 names_digest: lvl1.names_digest,
385 sequences_digest: lvl1.sequences_digest,
386 lengths_digest: lvl1.lengths_digest,
387 name_length_pairs_digest: None,
388 sorted_name_length_pairs_digest: None,
389 sorted_sequences_digest: None,
390 file_path,
391 }
392 }
393
394 pub fn compute_ancillary_digests(&mut self, sequences: &[SequenceRecord]) {
397 if self.name_length_pairs_digest.is_some() {
398 return;
399 }
400 let metadata_refs: Vec<&SequenceMetadata> =
401 sequences.iter().map(|r| r.metadata()).collect();
402 self.name_length_pairs_digest =
403 Some(SeqColDigestLvl1::compute_name_length_pairs_digest(&metadata_refs));
404 self.sorted_name_length_pairs_digest =
405 Some(SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&metadata_refs));
406 self.sorted_sequences_digest =
407 Some(SeqColDigestLvl1::compute_sorted_sequences_digest(&metadata_refs));
408 }
409
410 pub fn from_collection(collection: &SequenceCollection) -> Self {
412 collection.metadata.clone()
413 }
414
415 pub fn to_lvl1(&self) -> SeqColDigestLvl1 {
417 SeqColDigestLvl1 {
418 sequences_digest: self.sequences_digest.clone(),
419 names_digest: self.names_digest.clone(),
420 lengths_digest: self.lengths_digest.clone(),
421 }
422 }
423
424 pub fn to_level1(&self) -> CollectionLevel1 {
426 CollectionLevel1 {
427 names: self.names_digest.clone(),
428 lengths: self.lengths_digest.clone(),
429 sequences: self.sequences_digest.clone(),
430 name_length_pairs: self.name_length_pairs_digest.clone(),
431 sorted_name_length_pairs: self.sorted_name_length_pairs_digest.clone(),
432 sorted_sequences: self.sorted_sequences_digest.clone(),
433 }
434 }
435}
436
437#[derive(Debug, Clone, Serialize, Deserialize)]
439pub struct CollectionLevel1 {
440 pub names: String,
441 pub lengths: String,
442 pub sequences: String,
443 #[serde(skip_serializing_if = "Option::is_none")]
444 pub name_length_pairs: Option<String>,
445 #[serde(skip_serializing_if = "Option::is_none")]
446 pub sorted_name_length_pairs: Option<String>,
447 #[serde(skip_serializing_if = "Option::is_none")]
448 pub sorted_sequences: Option<String>,
449}
450
451#[derive(Debug, Clone, Serialize, Deserialize)]
454pub struct CollectionLevel2 {
455 pub names: Vec<String>,
456 pub lengths: Vec<usize>,
457 pub sequences: Vec<String>,
459}
460
461#[derive(Debug, Clone, Serialize, Deserialize)]
463pub struct SeqColComparison {
464 pub digests: ComparisonDigests,
465 pub attributes: AttributeComparison,
466 pub array_elements: ArrayElementComparison,
467}
468
469#[derive(Debug, Clone, Serialize, Deserialize)]
473pub struct ComparisonDigests {
474 pub a: String,
475 pub b: Option<String>,
476}
477
478#[derive(Debug, Clone, Serialize, Deserialize)]
480pub struct AttributeComparison {
481 pub a_only: Vec<String>,
482 pub b_only: Vec<String>,
483 pub a_and_b: Vec<String>,
484}
485
486#[derive(Debug, Clone, Serialize, Deserialize)]
488pub struct ArrayElementComparison {
489 pub a_count: HashMap<String, usize>,
490 pub b_count: HashMap<String, usize>,
491 pub a_and_b_count: HashMap<String, usize>,
492 pub a_and_b_same_order: HashMap<String, Option<bool>>,
493}
494
495#[derive(Clone, Debug)]
497pub struct SequenceCollection {
498 pub metadata: SequenceCollectionMetadata,
500
501 pub sequences: Vec<SequenceRecord>,
504}
505
506impl SequenceCollection {
507 pub fn from_records(records: Vec<SequenceRecord>) -> Self {
509 let metadata = SequenceCollectionMetadata::from_sequences(&records, None);
511
512 SequenceCollection {
513 metadata,
514 sequences: records,
515 }
516 }
517
518 pub fn to_level2(&self) -> CollectionLevel2 {
521 let names: Vec<String> = self
522 .sequences
523 .iter()
524 .map(|r| r.metadata().name.clone())
525 .collect();
526 let lengths: Vec<usize> = self.sequences.iter().map(|r| r.metadata().length).collect();
527 let sequences: Vec<String> = self
528 .sequences
529 .iter()
530 .map(|r| format!("SQ.{}", r.metadata().sha512t24u))
531 .collect();
532
533 CollectionLevel2 {
534 names,
535 lengths,
536 sequences,
537 }
538 }
539
540 pub fn build_sorted_sequences(&self) -> Vec<String> {
542 let mut seqs: Vec<String> = self
543 .sequences
544 .iter()
545 .map(|r| format!("SQ.{}", r.metadata().sha512t24u))
546 .collect();
547 seqs.sort();
548 seqs
549 }
550
551 pub fn build_name_length_pairs(&self) -> Vec<serde_json::Value> {
554 self.sequences
555 .iter()
556 .map(|r| {
557 let md = r.metadata();
558 let mut obj = serde_json::Map::new();
559 obj.insert(
560 "length".to_string(),
561 serde_json::Value::Number(serde_json::Number::from(md.length)),
562 );
563 obj.insert(
564 "name".to_string(),
565 serde_json::Value::String(md.name.clone()),
566 );
567 serde_json::Value::Object(obj)
568 })
569 .collect()
570 }
571
572 pub fn build_sorted_name_length_pairs(&self) -> Vec<serde_json::Value> {
575 let mut pairs_with_digests: Vec<(String, serde_json::Value)> = self
576 .sequences
577 .iter()
578 .map(|r| {
579 let md = r.metadata();
580 let mut obj = serde_json::Map::new();
581 obj.insert(
582 "length".to_string(),
583 serde_json::Value::Number(serde_json::Number::from(md.length)),
584 );
585 obj.insert(
586 "name".to_string(),
587 serde_json::Value::String(md.name.clone()),
588 );
589 let val = serde_json::Value::Object(obj);
590 let digest = sha512t24u(canonicalize_json(&val).as_bytes());
591 (digest, val)
592 })
593 .collect();
594 pairs_with_digests.sort_by(|a, b| a.0.cmp(&b.0));
595 pairs_with_digests.into_iter().map(|(_, v)| v).collect()
596 }
597
598 pub fn compare(&self, other: &SequenceCollection) -> SeqColComparison {
603 let arrays_a = self.to_comparison_arrays();
604 let arrays_b = other.to_comparison_arrays();
605 compare_arrays(
606 arrays_a,
607 arrays_b,
608 self.metadata.digest.clone(),
609 Some(other.metadata.digest.clone()),
610 )
611 }
612
613 pub(crate) fn to_comparison_arrays(&self) -> HashMap<String, Vec<String>> {
615 let mut map = HashMap::new();
616
617 map.insert(
619 "names".to_string(),
620 self.sequences.iter().map(|r| r.metadata().name.clone()).collect(),
621 );
622 map.insert(
623 "lengths".to_string(),
624 self.sequences.iter().map(|r| r.metadata().length.to_string()).collect(),
625 );
626 map.insert(
627 "sequences".to_string(),
628 self.sequences.iter().map(|r| format!("SQ.{}", r.metadata().sha512t24u)).collect(),
629 );
630
631 if self.metadata.sorted_sequences_digest.is_some() {
633 let mut sorted_seqs: Vec<String> = self.sequences
635 .iter()
636 .map(|r| format!("SQ.{}", r.metadata().sha512t24u))
637 .collect();
638 sorted_seqs.sort();
639 map.insert("sorted_sequences".to_string(), sorted_seqs);
640 }
641
642 if self.metadata.name_length_pairs_digest.is_some() {
643 let nlp: Vec<String> = self.sequences
645 .iter()
646 .map(|r| {
647 let md = r.metadata();
648 let mut obj = serde_json::Map::new();
649 obj.insert("length".to_string(), serde_json::Value::Number(serde_json::Number::from(md.length)));
650 obj.insert("name".to_string(), serde_json::Value::String(md.name.clone()));
651 canonicalize_json(&serde_json::Value::Object(obj))
652 })
653 .collect();
654 map.insert("name_length_pairs".to_string(), nlp);
655 }
656
657 if self.metadata.sorted_name_length_pairs_digest.is_some() {
658 let mut snlp: Vec<String> = self.sequences
660 .iter()
661 .map(|r| {
662 let md = r.metadata();
663 let mut obj = serde_json::Map::new();
664 obj.insert("length".to_string(), serde_json::Value::Number(serde_json::Number::from(md.length)));
665 obj.insert("name".to_string(), serde_json::Value::String(md.name.clone()));
666 sha512t24u(canonicalize_json(&serde_json::Value::Object(obj)).as_bytes())
667 })
668 .collect();
669 snlp.sort();
670 map.insert("sorted_name_length_pairs".to_string(), snlp);
671 }
672
673 map
674 }
675}
676
/// Compares two string arrays element-wise.
///
/// Returns `(overlap, same_order)`:
///
/// * `overlap` is the smaller of "elements of `a` that also occur in `b`" and
///   "elements of `b` that also occur in `a`" (duplicates are counted).
/// * `same_order` is `None` when fewer than two elements overlap or when the two filtered
///   lists differ in length (which happens with unbalanced duplicates); otherwise it says
///   whether the shared elements appear in the same order in both arrays.
fn compare_elements(a: &[String], b: &[String]) -> (usize, Option<bool>) {
    use std::collections::HashSet;

    let members_of_a: HashSet<&str> = a.iter().map(String::as_str).collect();
    let members_of_b: HashSet<&str> = b.iter().map(String::as_str).collect();

    // Keep only shared elements, preserving each array's own order.
    let shared_in_a: Vec<&str> = a
        .iter()
        .map(String::as_str)
        .filter(|item| members_of_b.contains(item))
        .collect();
    let shared_in_b: Vec<&str> = b
        .iter()
        .map(String::as_str)
        .filter(|item| members_of_a.contains(item))
        .collect();

    let overlap = shared_in_a.len().min(shared_in_b.len());

    // Order is only meaningful with at least two shared elements and equal-length lists.
    let comparable = overlap >= 2 && shared_in_a.len() == shared_in_b.len();
    let same_order = if comparable {
        Some(shared_in_a == shared_in_b)
    } else {
        None
    };

    (overlap, same_order)
}
708
709pub(crate) fn compare_arrays(
712 arrays_a: HashMap<String, Vec<String>>,
713 arrays_b: HashMap<String, Vec<String>>,
714 digest_a: String,
715 digest_b: Option<String>,
716) -> SeqColComparison {
717 let a_keys: std::collections::BTreeSet<&str> = arrays_a.keys().map(|s| s.as_str()).collect();
718 let b_keys: std::collections::BTreeSet<&str> = arrays_b.keys().map(|s| s.as_str()).collect();
719
720 let mut a_only = Vec::new();
721 let mut b_only = Vec::new();
722 let mut a_and_b = Vec::new();
723
724 let mut all_keys: Vec<&str> = a_keys.union(&b_keys).copied().collect();
725 all_keys.sort();
726
727 for key in all_keys {
728 let in_a = a_keys.contains(key);
729 let in_b = b_keys.contains(key);
730 match (in_a, in_b) {
731 (true, true) => a_and_b.push(key.to_string()),
732 (true, false) => a_only.push(key.to_string()),
733 (false, true) => b_only.push(key.to_string()),
734 (false, false) => unreachable!(),
735 }
736 }
737
738 let mut a_count = HashMap::new();
739 let mut b_count = HashMap::new();
740 let mut a_and_b_count = HashMap::new();
741 let mut a_and_b_same_order = HashMap::new();
742
743 for (k, v) in &arrays_a {
744 a_count.insert(k.clone(), v.len());
745 }
746 for (k, v) in &arrays_b {
747 b_count.insert(k.clone(), v.len());
748 }
749
750 for attr in &a_and_b {
751 let arr_a = arrays_a.get(attr).unwrap();
752 let arr_b = arrays_b.get(attr).unwrap();
753 let (overlap, same_order) = compare_elements(arr_a, arr_b);
754 a_and_b_count.insert(attr.clone(), overlap);
755 a_and_b_same_order.insert(attr.clone(), same_order);
756 }
757
758 SeqColComparison {
759 digests: ComparisonDigests {
760 a: digest_a,
761 b: digest_b,
762 },
763 attributes: AttributeComparison {
764 a_only,
765 b_only,
766 a_and_b,
767 },
768 array_elements: ArrayElementComparison {
769 a_count,
770 b_count,
771 a_and_b_count,
772 a_and_b_same_order,
773 },
774 }
775}
776
777pub(crate) fn level2_to_comparison_arrays(level2: &CollectionLevel2) -> HashMap<String, Vec<String>> {
781 let mut map = HashMap::new();
782 map.insert("names".to_string(), level2.names.clone());
783 map.insert(
784 "lengths".to_string(),
785 level2.lengths.iter().map(|l| l.to_string()).collect(),
786 );
787 map.insert("sequences".to_string(), level2.sequences.clone());
788 map
789}
790
791impl Display for SequenceCollection {
792 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
793 write!(
794 f,
795 "SequenceCollection with {} sequences, digest: {}",
796 self.sequences.len(),
797 self.metadata.digest
798 )?;
799 write!(f, "\nFirst 3 sequences:")?;
800 for seqrec in self.sequences.iter().take(3) {
801 write!(f, "\n- {}", seqrec)?;
802 }
803 Ok(())
804 }
805}
806
807impl<'a> IntoIterator for &'a SequenceCollection {
810 type Item = &'a SequenceRecord;
811 type IntoIter = std::slice::Iter<'a, SequenceRecord>;
812
813 fn into_iter(self) -> Self::IntoIter {
814 self.sequences.iter()
815 }
816}
817
818impl IntoIterator for SequenceCollection {
821 type Item = SequenceRecord;
822 type IntoIter = std::vec::IntoIter<SequenceRecord>;
823
824 fn into_iter(self) -> Self::IntoIter {
825 self.sequences.into_iter()
826 }
827}
828
829#[derive(Clone, Debug)]
832pub enum SequenceCollectionRecord {
833 Stub(SequenceCollectionMetadata),
835 Full {
837 metadata: SequenceCollectionMetadata,
838 sequences: Vec<SequenceRecord>,
839 },
840}
841
842impl SequenceCollectionRecord {
843 pub fn metadata(&self) -> &SequenceCollectionMetadata {
845 match self {
846 SequenceCollectionRecord::Stub(meta) => meta,
847 SequenceCollectionRecord::Full { metadata, .. } => metadata,
848 }
849 }
850
851 pub fn sequences(&self) -> Option<&[SequenceRecord]> {
853 match self {
854 SequenceCollectionRecord::Stub(_) => None,
855 SequenceCollectionRecord::Full { sequences, .. } => Some(sequences),
856 }
857 }
858
859 pub fn has_sequences(&self) -> bool {
861 matches!(self, SequenceCollectionRecord::Full { .. })
862 }
863
864 pub fn with_sequences(self, sequences: Vec<SequenceRecord>) -> Self {
866 let metadata = match self {
867 SequenceCollectionRecord::Stub(m) => m,
868 SequenceCollectionRecord::Full { metadata, .. } => metadata,
869 };
870 SequenceCollectionRecord::Full {
871 metadata,
872 sequences,
873 }
874 }
875
876 pub fn to_collection(&self) -> SequenceCollection {
878 match self {
879 SequenceCollectionRecord::Stub(meta) => {
880 SequenceCollection {
882 metadata: meta.clone(),
883 sequences: Vec::new(),
884 }
885 }
886 SequenceCollectionRecord::Full {
887 metadata,
888 sequences,
889 } => SequenceCollection {
890 metadata: metadata.clone(),
891 sequences: sequences.clone(),
892 },
893 }
894 }
895}
896
897impl From<SequenceCollection> for SequenceCollectionRecord {
898 fn from(collection: SequenceCollection) -> Self {
899 SequenceCollectionRecord::Full {
900 metadata: collection.metadata,
901 sequences: collection.sequences,
902 }
903 }
904}
905
906pub fn digest_sequence(name: &str, data: &[u8]) -> SequenceRecord {
933 let uppercased: Vec<u8> = data.iter().map(|b| b.to_ascii_uppercase()).collect();
935
936 let metadata = SequenceMetadata {
937 name: name.to_string(),
938 description: None,
939 length: data.len(),
940 sha512t24u: sha512t24u(&uppercased),
941 md5: md5(&uppercased),
942 alphabet: guess_alphabet(&uppercased),
943 fai: None, };
945 SequenceRecord::Full {
946 metadata,
947 sequence: uppercased,
948 }
949}
950
951pub fn digest_sequence_with_description(
963 name: &str,
964 description: Option<&str>,
965 data: &[u8],
966) -> SequenceRecord {
967 let mut seq = digest_sequence(name, data);
968 if let SequenceRecord::Full {
969 ref mut metadata, ..
970 } = seq
971 {
972 metadata.description = description.map(String::from);
973 }
974 seq
975}
976
977pub fn parse_rgsi_line(line: &str) -> Option<SequenceMetadata> {
985 if line.trim().is_empty() {
987 return None;
988 }
989
990 let parts: Vec<&str> = line.split('\t').collect();
991
992 match parts.len() {
993 5 => Some(SequenceMetadata {
995 name: parts[0].to_string(),
996 description: None,
997 length: parts[1].parse().ok()?,
998 alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
999 sha512t24u: parts[3].to_string(),
1000 md5: parts[4].to_string(),
1001 fai: None,
1002 }),
1003 6 => Some(SequenceMetadata {
1005 name: parts[0].to_string(),
1006 description: if parts[5].is_empty() {
1007 None
1008 } else {
1009 Some(parts[5].to_string())
1010 },
1011 length: parts[1].parse().ok()?,
1012 alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
1013 sha512t24u: parts[3].to_string(),
1014 md5: parts[4].to_string(),
1015 fai: None,
1016 }),
1017 _ => None,
1018 }
1019}
1020
1021pub fn parse_rgci_line(line: &str) -> Option<SequenceCollectionMetadata> {
1031 if line.starts_with('#') {
1032 return None;
1033 }
1034 let parts: Vec<&str> = line.split('\t').collect();
1035 if parts.len() < 5 {
1036 return None;
1037 }
1038 let opt_col = |i: usize| -> Option<String> {
1040 parts.get(i).and_then(|s| {
1041 if s.is_empty() { None } else { Some(s.to_string()) }
1042 })
1043 };
1044 Some(SequenceCollectionMetadata {
1045 digest: parts[0].to_string(),
1046 n_sequences: parts[1].parse().ok()?,
1047 names_digest: parts[2].to_string(),
1048 sequences_digest: parts[3].to_string(),
1049 lengths_digest: parts[4].to_string(),
1050 name_length_pairs_digest: opt_col(5),
1051 sorted_name_length_pairs_digest: opt_col(6),
1052 sorted_sequences_digest: opt_col(7),
1053 file_path: None,
1054 })
1055}
1056
#[cfg(test)]
mod tests {
    use super::*;

    /// Digest string length asserted by the ancillary-digest tests.
    const DIGEST_LEN: usize = 32;

    /// Builds a `Dna2bit` metadata fixture with the given name, length and digest strings.
    fn dna_metadata(name: &str, length: usize, digest: &str, md5_digest: &str) -> SequenceMetadata {
        SequenceMetadata {
            name: name.to_string(),
            description: None,
            length,
            sha512t24u: digest.to_string(),
            md5: md5_digest.to_string(),
            alphabet: AlphabetType::Dna2bit,
            fai: None,
        }
    }

    /// Two-sequence fixture shared by most tests.
    fn test_metadata() -> Vec<SequenceMetadata> {
        vec![
            dna_metadata("chrX", 8, "abc123", "md5abc"),
            dna_metadata("chr1", 4, "def456", "md5def"),
        ]
    }

    /// The shared fixture in reverse order.
    fn reversed_metadata() -> Vec<SequenceMetadata> {
        let mut reversed = test_metadata();
        reversed.reverse();
        reversed
    }

    fn as_refs(metadata: &[SequenceMetadata]) -> Vec<&SequenceMetadata> {
        metadata.iter().collect()
    }

    fn stub_records(metadata: Vec<SequenceMetadata>) -> Vec<SequenceRecord> {
        metadata.into_iter().map(SequenceRecord::Stub).collect()
    }

    #[test]
    fn test_ancillary_digest_nlp() {
        let metadata = test_metadata();
        let nlp = SeqColDigestLvl1::compute_name_length_pairs_digest(&as_refs(&metadata));
        assert!(!nlp.is_empty());
        assert_eq!(nlp.len(), DIGEST_LEN);
    }

    #[test]
    fn test_ancillary_digest_snlp() {
        let metadata = test_metadata();
        let snlp = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&as_refs(&metadata));
        assert!(!snlp.is_empty());
        assert_eq!(snlp.len(), DIGEST_LEN);
    }

    #[test]
    fn test_ancillary_digest_sorted_sequences() {
        let metadata = test_metadata();
        let ss = SeqColDigestLvl1::compute_sorted_sequences_digest(&as_refs(&metadata));
        assert!(!ss.is_empty());
        assert_eq!(ss.len(), DIGEST_LEN);
    }

    #[test]
    fn test_nlp_and_snlp_both_valid() {
        let metadata = test_metadata();
        let refs = as_refs(&metadata);
        let nlp = SeqColDigestLvl1::compute_name_length_pairs_digest(&refs);
        let snlp = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&refs);
        assert_eq!(nlp.len(), DIGEST_LEN);
        assert_eq!(snlp.len(), DIGEST_LEN);
    }

    #[test]
    fn test_snlp_order_invariant() {
        let forward = test_metadata();
        let backward = reversed_metadata();

        let snlp_forward =
            SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&as_refs(&forward));
        let snlp_backward =
            SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&as_refs(&backward));

        assert_eq!(snlp_forward, snlp_backward);
    }

    #[test]
    fn test_sorted_sequences_order_invariant() {
        let forward = test_metadata();
        let backward = reversed_metadata();

        let ss_forward = SeqColDigestLvl1::compute_sorted_sequences_digest(&as_refs(&forward));
        let ss_backward = SeqColDigestLvl1::compute_sorted_sequences_digest(&as_refs(&backward));

        assert_eq!(ss_forward, ss_backward);
    }

    #[test]
    fn test_nlp_order_sensitive() {
        let forward = test_metadata();
        let backward = reversed_metadata();

        let nlp_forward = SeqColDigestLvl1::compute_name_length_pairs_digest(&as_refs(&forward));
        let nlp_backward = SeqColDigestLvl1::compute_name_length_pairs_digest(&as_refs(&backward));

        assert_ne!(nlp_forward, nlp_backward);
    }

    #[test]
    fn test_to_level1() {
        let records = stub_records(test_metadata());
        let mut coll_meta = SequenceCollectionMetadata::from_sequences(&records, None);
        coll_meta.compute_ancillary_digests(&records);

        let lvl1 = coll_meta.to_level1();
        assert_eq!(lvl1.names, coll_meta.names_digest);
        assert_eq!(lvl1.lengths, coll_meta.lengths_digest);
        assert_eq!(lvl1.sequences, coll_meta.sequences_digest);
        assert!(lvl1.name_length_pairs.is_some());
        assert!(lvl1.sorted_name_length_pairs.is_some());
        assert!(lvl1.sorted_sequences.is_some());
    }

    #[test]
    fn test_to_level2() {
        let collection = SequenceCollection::from_records(stub_records(test_metadata()));

        let lvl2 = collection.to_level2();
        assert_eq!(lvl2.names, vec!["chrX", "chr1"]);
        assert_eq!(lvl2.lengths, vec![8, 4]);
        assert_eq!(lvl2.sequences.len(), 2);
        assert!(lvl2.sequences[0].starts_with("SQ."));
    }

    #[test]
    fn test_compare_same() {
        let collection = SequenceCollection::from_records(stub_records(test_metadata()));
        let result = collection.compare(&collection);

        assert_eq!(Some(result.digests.a.as_str()), result.digests.b.as_deref());
        assert_eq!(result.attributes.a_and_b.len(), 3);
        for attr in &result.attributes.a_and_b {
            assert_eq!(result.array_elements.a_and_b_same_order[attr], Some(true));
        }
    }

    #[test]
    fn test_compare_reversed_order() {
        let coll_a = SequenceCollection::from_records(stub_records(test_metadata()));
        let coll_b = SequenceCollection::from_records(stub_records(reversed_metadata()));

        let result = coll_a.compare(&coll_b);
        for attr in &result.attributes.a_and_b {
            assert_eq!(result.array_elements.a_and_b_count[attr], 2);
            assert_eq!(result.array_elements.a_and_b_same_order[attr], Some(false));
        }
    }

    #[test]
    fn test_compare_single_element() {
        let single = dna_metadata("chr1", 4, "abc", "md5");
        let coll = SequenceCollection::from_records(vec![SequenceRecord::Stub(single)]);
        let result = coll.compare(&coll);

        // With a single shared element the ordering cannot be determined.
        for attr in &result.attributes.a_and_b {
            assert_eq!(result.array_elements.a_and_b_same_order[attr], None);
        }
    }
}