use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt::Display;
use std::path::PathBuf;
use super::algorithms::{canonicalize_json, md5, sha512t24u};
use super::alphabet::{AlphabetType, guess_alphabet};
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SequenceMetadata {
pub name: String,
#[serde(default)]
pub description: Option<String>,
pub length: usize,
pub sha512t24u: String,
pub md5: String,
pub alphabet: AlphabetType,
pub fai: Option<FaiMetadata>,
}
impl Default for SequenceMetadata {
fn default() -> Self {
Self {
name: String::new(),
description: None,
length: 0,
sha512t24u: String::new(),
md5: String::new(),
alphabet: AlphabetType::Ascii,
fai: None,
}
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct FaiMetadata {
pub offset: u64, pub line_bases: u32, pub line_bytes: u32, }
#[derive(Clone, Debug)]
pub enum SequenceRecord {
Stub(SequenceMetadata),
Full {
metadata: SequenceMetadata,
sequence: Vec<u8>,
},
}
impl SequenceRecord {
pub fn metadata(&self) -> &SequenceMetadata {
match self {
SequenceRecord::Stub(meta) => meta,
SequenceRecord::Full { metadata, .. } => metadata,
}
}
pub fn sequence(&self) -> Option<&[u8]> {
match self {
SequenceRecord::Stub(_) => None,
SequenceRecord::Full { sequence, .. } => Some(sequence),
}
}
pub fn is_loaded(&self) -> bool {
matches!(self, SequenceRecord::Full { .. })
}
pub fn with_data(self, sequence: Vec<u8>) -> Self {
let metadata = match self {
SequenceRecord::Stub(m) => m,
SequenceRecord::Full { metadata, .. } => metadata,
};
SequenceRecord::Full { metadata, sequence }
}
pub fn load_data(&mut self, sequence: Vec<u8>) {
match self {
SequenceRecord::Stub(metadata) => {
let metadata = std::mem::take(metadata);
*self = SequenceRecord::Full { metadata, sequence };
}
SequenceRecord::Full {
sequence: existing, ..
} => {
*existing = sequence;
}
}
}
pub fn decode(&self) -> Option<String> {
use super::alphabet::lookup_alphabet;
use super::encoder::decode_substring_from_bytes;
let (metadata, data) = match self {
SequenceRecord::Stub(_) => return None,
SequenceRecord::Full { metadata, sequence } => (metadata, sequence),
};
if metadata.alphabet == AlphabetType::Ascii {
return String::from_utf8(data.clone()).ok();
}
let alphabet = lookup_alphabet(&metadata.alphabet);
if data.len() == metadata.length {
if let Ok(raw_string) = String::from_utf8(data.clone()) {
return Some(raw_string);
}
}
let decoded_bytes = decode_substring_from_bytes(data, 0, metadata.length, alphabet);
String::from_utf8(decoded_bytes).ok()
}
}
impl Display for SequenceRecord {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"SequenceRecord: {} (length: {}, alphabet: {}, ga4gh: {:02x?}, md5: {:02x?})",
&self.metadata().name,
&self.metadata().length,
&self.metadata().alphabet,
&self.metadata().sha512t24u,
&self.metadata().md5
)?;
Ok(())
}
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct SeqColDigestLvl1 {
pub sequences_digest: String,
pub names_digest: String,
pub lengths_digest: String,
}
impl SeqColDigestLvl1 {
pub fn to_digest(&self) -> String {
let mut lvl1_object = serde_json::Map::new();
lvl1_object.insert(
"names".to_string(),
serde_json::Value::String(self.names_digest.clone()),
);
lvl1_object.insert(
"sequences".to_string(),
serde_json::Value::String(self.sequences_digest.clone()),
);
let lvl1_json = serde_json::Value::Object(lvl1_object);
let lvl1_canonical = canonicalize_json(&lvl1_json);
sha512t24u(lvl1_canonical.as_bytes())
}
pub fn from_metadata(metadata_vec: &[&SequenceMetadata]) -> Self {
use serde_json::Value;
let sequences: Vec<String> = metadata_vec
.iter()
.map(|md| format!("SQ.{}", md.sha512t24u))
.collect();
let names: Vec<&str> = metadata_vec.iter().map(|md| md.name.as_str()).collect();
let lengths: Vec<usize> = metadata_vec.iter().map(|md| md.length).collect();
let sequences_json = Value::Array(
sequences
.iter()
.map(|s| Value::String(s.to_string()))
.collect(),
);
let names_json = Value::Array(names.iter().map(|s| Value::String(s.to_string())).collect());
let lengths_json = Value::Array(
lengths
.iter()
.map(|l| Value::Number(serde_json::Number::from(*l)))
.collect(),
);
let sequences_canonical = canonicalize_json(&sequences_json);
let names_canonical = canonicalize_json(&names_json);
let lengths_canonical = canonicalize_json(&lengths_json);
SeqColDigestLvl1 {
sequences_digest: sha512t24u(sequences_canonical.as_bytes()),
names_digest: sha512t24u(names_canonical.as_bytes()),
lengths_digest: sha512t24u(lengths_canonical.as_bytes()),
}
}
pub fn compute_name_length_pairs_digest(metadata: &[&SequenceMetadata]) -> String {
use serde_json::Value;
let pairs: Vec<Value> = metadata
.iter()
.map(|md| {
let mut obj = serde_json::Map::new();
obj.insert(
"length".to_string(),
Value::Number(serde_json::Number::from(md.length)),
);
obj.insert("name".to_string(), Value::String(md.name.clone()));
Value::Object(obj)
})
.collect();
let canonical = canonicalize_json(&Value::Array(pairs));
sha512t24u(canonical.as_bytes())
}
pub fn compute_sorted_name_length_pairs_digest(metadata: &[&SequenceMetadata]) -> String {
use serde_json::Value;
let mut pair_digests: Vec<String> = metadata
.iter()
.map(|md| {
let mut obj = serde_json::Map::new();
obj.insert(
"length".to_string(),
Value::Number(serde_json::Number::from(md.length)),
);
obj.insert("name".to_string(), Value::String(md.name.clone()));
let canonical = canonicalize_json(&Value::Object(obj));
sha512t24u(canonical.as_bytes())
})
.collect();
pair_digests.sort();
let array_json = Value::Array(
pair_digests
.iter()
.map(|d| Value::String(d.clone()))
.collect(),
);
let canonical = canonicalize_json(&array_json);
sha512t24u(canonical.as_bytes())
}
pub fn compute_sorted_sequences_digest(metadata: &[&SequenceMetadata]) -> String {
use serde_json::Value;
let mut sequences: Vec<String> = metadata
.iter()
.map(|md| format!("SQ.{}", md.sha512t24u))
.collect();
sequences.sort();
let array_json = Value::Array(
sequences
.iter()
.map(|s| Value::String(s.clone()))
.collect(),
);
let canonical = canonicalize_json(&array_json);
sha512t24u(canonical.as_bytes())
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SequenceCollectionMetadata {
pub digest: String,
pub n_sequences: usize,
pub names_digest: String,
pub sequences_digest: String,
pub lengths_digest: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub name_length_pairs_digest: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub sorted_name_length_pairs_digest: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub sorted_sequences_digest: Option<String>,
pub file_path: Option<PathBuf>,
}
impl SequenceCollectionMetadata {
pub fn from_sequences(
sequences: &[SequenceRecord],
file_path: Option<PathBuf>,
) -> Self {
let metadata_refs: Vec<&SequenceMetadata> =
sequences.iter().map(|r| r.metadata()).collect();
let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs);
let digest = lvl1.to_digest();
Self {
digest,
n_sequences: sequences.len(),
names_digest: lvl1.names_digest,
sequences_digest: lvl1.sequences_digest,
lengths_digest: lvl1.lengths_digest,
name_length_pairs_digest: None,
sorted_name_length_pairs_digest: None,
sorted_sequences_digest: None,
file_path,
}
}
pub fn compute_ancillary_digests(&mut self, sequences: &[SequenceRecord]) {
if self.name_length_pairs_digest.is_some() {
return;
}
let metadata_refs: Vec<&SequenceMetadata> =
sequences.iter().map(|r| r.metadata()).collect();
self.name_length_pairs_digest =
Some(SeqColDigestLvl1::compute_name_length_pairs_digest(&metadata_refs));
self.sorted_name_length_pairs_digest =
Some(SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&metadata_refs));
self.sorted_sequences_digest =
Some(SeqColDigestLvl1::compute_sorted_sequences_digest(&metadata_refs));
}
pub fn from_collection(collection: &SequenceCollection) -> Self {
collection.metadata.clone()
}
pub fn to_lvl1(&self) -> SeqColDigestLvl1 {
SeqColDigestLvl1 {
sequences_digest: self.sequences_digest.clone(),
names_digest: self.names_digest.clone(),
lengths_digest: self.lengths_digest.clone(),
}
}
pub fn to_level1(&self) -> CollectionLevel1 {
CollectionLevel1 {
names: self.names_digest.clone(),
lengths: self.lengths_digest.clone(),
sequences: self.sequences_digest.clone(),
name_length_pairs: self.name_length_pairs_digest.clone(),
sorted_name_length_pairs: self.sorted_name_length_pairs_digest.clone(),
sorted_sequences: self.sorted_sequences_digest.clone(),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionLevel1 {
pub names: String,
pub lengths: String,
pub sequences: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub name_length_pairs: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub sorted_name_length_pairs: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub sorted_sequences: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionLevel2 {
pub names: Vec<String>,
pub lengths: Vec<usize>,
pub sequences: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SeqColComparison {
pub digests: ComparisonDigests,
pub attributes: AttributeComparison,
pub array_elements: ArrayElementComparison,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonDigests {
pub a: String,
pub b: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AttributeComparison {
pub a_only: Vec<String>,
pub b_only: Vec<String>,
pub a_and_b: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArrayElementComparison {
pub a_count: HashMap<String, usize>,
pub b_count: HashMap<String, usize>,
pub a_and_b_count: HashMap<String, usize>,
pub a_and_b_same_order: HashMap<String, Option<bool>>,
}
#[derive(Clone, Debug)]
pub struct SequenceCollection {
pub metadata: SequenceCollectionMetadata,
pub sequences: Vec<SequenceRecord>,
}
impl SequenceCollection {
pub fn from_records(records: Vec<SequenceRecord>) -> Self {
let metadata = SequenceCollectionMetadata::from_sequences(&records, None);
SequenceCollection {
metadata,
sequences: records,
}
}
pub fn to_level2(&self) -> CollectionLevel2 {
let names: Vec<String> = self
.sequences
.iter()
.map(|r| r.metadata().name.clone())
.collect();
let lengths: Vec<usize> = self.sequences.iter().map(|r| r.metadata().length).collect();
let sequences: Vec<String> = self
.sequences
.iter()
.map(|r| format!("SQ.{}", r.metadata().sha512t24u))
.collect();
CollectionLevel2 {
names,
lengths,
sequences,
}
}
pub fn build_sorted_sequences(&self) -> Vec<String> {
let mut seqs: Vec<String> = self
.sequences
.iter()
.map(|r| format!("SQ.{}", r.metadata().sha512t24u))
.collect();
seqs.sort();
seqs
}
pub fn build_name_length_pairs(&self) -> Vec<serde_json::Value> {
self.sequences
.iter()
.map(|r| {
let md = r.metadata();
let mut obj = serde_json::Map::new();
obj.insert(
"length".to_string(),
serde_json::Value::Number(serde_json::Number::from(md.length)),
);
obj.insert(
"name".to_string(),
serde_json::Value::String(md.name.clone()),
);
serde_json::Value::Object(obj)
})
.collect()
}
pub fn build_sorted_name_length_pairs(&self) -> Vec<serde_json::Value> {
let mut pairs_with_digests: Vec<(String, serde_json::Value)> = self
.sequences
.iter()
.map(|r| {
let md = r.metadata();
let mut obj = serde_json::Map::new();
obj.insert(
"length".to_string(),
serde_json::Value::Number(serde_json::Number::from(md.length)),
);
obj.insert(
"name".to_string(),
serde_json::Value::String(md.name.clone()),
);
let val = serde_json::Value::Object(obj);
let digest = sha512t24u(canonicalize_json(&val).as_bytes());
(digest, val)
})
.collect();
pairs_with_digests.sort_by(|a, b| a.0.cmp(&b.0));
pairs_with_digests.into_iter().map(|(_, v)| v).collect()
}
pub fn compare(&self, other: &SequenceCollection) -> SeqColComparison {
let arrays_a = self.to_comparison_arrays();
let arrays_b = other.to_comparison_arrays();
compare_arrays(
arrays_a,
arrays_b,
self.metadata.digest.clone(),
Some(other.metadata.digest.clone()),
)
}
pub(crate) fn to_comparison_arrays(&self) -> HashMap<String, Vec<String>> {
let mut map = HashMap::new();
map.insert(
"names".to_string(),
self.sequences.iter().map(|r| r.metadata().name.clone()).collect(),
);
map.insert(
"lengths".to_string(),
self.sequences.iter().map(|r| r.metadata().length.to_string()).collect(),
);
map.insert(
"sequences".to_string(),
self.sequences.iter().map(|r| format!("SQ.{}", r.metadata().sha512t24u)).collect(),
);
if self.metadata.sorted_sequences_digest.is_some() {
let mut sorted_seqs: Vec<String> = self.sequences
.iter()
.map(|r| format!("SQ.{}", r.metadata().sha512t24u))
.collect();
sorted_seqs.sort();
map.insert("sorted_sequences".to_string(), sorted_seqs);
}
if self.metadata.name_length_pairs_digest.is_some() {
let nlp: Vec<String> = self.sequences
.iter()
.map(|r| {
let md = r.metadata();
let mut obj = serde_json::Map::new();
obj.insert("length".to_string(), serde_json::Value::Number(serde_json::Number::from(md.length)));
obj.insert("name".to_string(), serde_json::Value::String(md.name.clone()));
canonicalize_json(&serde_json::Value::Object(obj))
})
.collect();
map.insert("name_length_pairs".to_string(), nlp);
}
if self.metadata.sorted_name_length_pairs_digest.is_some() {
let mut snlp: Vec<String> = self.sequences
.iter()
.map(|r| {
let md = r.metadata();
let mut obj = serde_json::Map::new();
obj.insert("length".to_string(), serde_json::Value::Number(serde_json::Number::from(md.length)));
obj.insert("name".to_string(), serde_json::Value::String(md.name.clone()));
sha512t24u(canonicalize_json(&serde_json::Value::Object(obj)).as_bytes())
})
.collect();
snlp.sort();
map.insert("sorted_name_length_pairs".to_string(), snlp);
}
map
}
}
fn compare_elements(a: &[String], b: &[String]) -> (usize, Option<bool>) {
use std::collections::HashSet;
let set_a: HashSet<&str> = a.iter().map(|s| s.as_str()).collect();
let set_b: HashSet<&str> = b.iter().map(|s| s.as_str()).collect();
let filtered_a: Vec<&str> = a.iter().filter(|x| set_b.contains(x.as_str())).map(|s| s.as_str()).collect();
let filtered_b: Vec<&str> = b.iter().filter(|x| set_a.contains(x.as_str())).map(|s| s.as_str()).collect();
let overlap = filtered_a.len().min(filtered_b.len());
let same_order = if overlap < 2 {
None
} else if filtered_a.len() != filtered_b.len() || filtered_a.len() != overlap {
None
} else {
Some(filtered_a == filtered_b)
};
(overlap, same_order)
}
pub(crate) fn compare_arrays(
arrays_a: HashMap<String, Vec<String>>,
arrays_b: HashMap<String, Vec<String>>,
digest_a: String,
digest_b: Option<String>,
) -> SeqColComparison {
let a_keys: std::collections::BTreeSet<&str> = arrays_a.keys().map(|s| s.as_str()).collect();
let b_keys: std::collections::BTreeSet<&str> = arrays_b.keys().map(|s| s.as_str()).collect();
let mut a_only = Vec::new();
let mut b_only = Vec::new();
let mut a_and_b = Vec::new();
let mut all_keys: Vec<&str> = a_keys.union(&b_keys).copied().collect();
all_keys.sort();
for key in all_keys {
let in_a = a_keys.contains(key);
let in_b = b_keys.contains(key);
match (in_a, in_b) {
(true, true) => a_and_b.push(key.to_string()),
(true, false) => a_only.push(key.to_string()),
(false, true) => b_only.push(key.to_string()),
(false, false) => unreachable!(),
}
}
let mut a_count = HashMap::new();
let mut b_count = HashMap::new();
let mut a_and_b_count = HashMap::new();
let mut a_and_b_same_order = HashMap::new();
for (k, v) in &arrays_a {
a_count.insert(k.clone(), v.len());
}
for (k, v) in &arrays_b {
b_count.insert(k.clone(), v.len());
}
for attr in &a_and_b {
let arr_a = arrays_a.get(attr).unwrap();
let arr_b = arrays_b.get(attr).unwrap();
let (overlap, same_order) = compare_elements(arr_a, arr_b);
a_and_b_count.insert(attr.clone(), overlap);
a_and_b_same_order.insert(attr.clone(), same_order);
}
SeqColComparison {
digests: ComparisonDigests {
a: digest_a,
b: digest_b,
},
attributes: AttributeComparison {
a_only,
b_only,
a_and_b,
},
array_elements: ArrayElementComparison {
a_count,
b_count,
a_and_b_count,
a_and_b_same_order,
},
}
}
pub(crate) fn level2_to_comparison_arrays(level2: &CollectionLevel2) -> HashMap<String, Vec<String>> {
let mut map = HashMap::new();
map.insert("names".to_string(), level2.names.clone());
map.insert(
"lengths".to_string(),
level2.lengths.iter().map(|l| l.to_string()).collect(),
);
map.insert("sequences".to_string(), level2.sequences.clone());
map
}
impl Display for SequenceCollection {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"SequenceCollection with {} sequences, digest: {}",
self.sequences.len(),
self.metadata.digest
)?;
write!(f, "\nFirst 3 sequences:")?;
for seqrec in self.sequences.iter().take(3) {
write!(f, "\n- {}", seqrec)?;
}
Ok(())
}
}
impl<'a> IntoIterator for &'a SequenceCollection {
type Item = &'a SequenceRecord;
type IntoIter = std::slice::Iter<'a, SequenceRecord>;
fn into_iter(self) -> Self::IntoIter {
self.sequences.iter()
}
}
impl IntoIterator for SequenceCollection {
type Item = SequenceRecord;
type IntoIter = std::vec::IntoIter<SequenceRecord>;
fn into_iter(self) -> Self::IntoIter {
self.sequences.into_iter()
}
}
#[derive(Clone, Debug)]
pub enum SequenceCollectionRecord {
Stub(SequenceCollectionMetadata),
Full {
metadata: SequenceCollectionMetadata,
sequences: Vec<SequenceRecord>,
},
}
impl SequenceCollectionRecord {
pub fn metadata(&self) -> &SequenceCollectionMetadata {
match self {
SequenceCollectionRecord::Stub(meta) => meta,
SequenceCollectionRecord::Full { metadata, .. } => metadata,
}
}
pub fn sequences(&self) -> Option<&[SequenceRecord]> {
match self {
SequenceCollectionRecord::Stub(_) => None,
SequenceCollectionRecord::Full { sequences, .. } => Some(sequences),
}
}
pub fn has_sequences(&self) -> bool {
matches!(self, SequenceCollectionRecord::Full { .. })
}
pub fn with_sequences(self, sequences: Vec<SequenceRecord>) -> Self {
let metadata = match self {
SequenceCollectionRecord::Stub(m) => m,
SequenceCollectionRecord::Full { metadata, .. } => metadata,
};
SequenceCollectionRecord::Full {
metadata,
sequences,
}
}
pub fn to_collection(&self) -> SequenceCollection {
match self {
SequenceCollectionRecord::Stub(meta) => {
SequenceCollection {
metadata: meta.clone(),
sequences: Vec::new(),
}
}
SequenceCollectionRecord::Full {
metadata,
sequences,
} => SequenceCollection {
metadata: metadata.clone(),
sequences: sequences.clone(),
},
}
}
}
impl From<SequenceCollection> for SequenceCollectionRecord {
fn from(collection: SequenceCollection) -> Self {
SequenceCollectionRecord::Full {
metadata: collection.metadata,
sequences: collection.sequences,
}
}
}
pub fn digest_sequence(name: &str, data: &[u8]) -> SequenceRecord {
let uppercased: Vec<u8> = data.iter().map(|b| b.to_ascii_uppercase()).collect();
let metadata = SequenceMetadata {
name: name.to_string(),
description: None,
length: data.len(),
sha512t24u: sha512t24u(&uppercased),
md5: md5(&uppercased),
alphabet: guess_alphabet(&uppercased),
fai: None, };
SequenceRecord::Full {
metadata,
sequence: uppercased,
}
}
pub fn digest_sequence_with_description(
name: &str,
description: Option<&str>,
data: &[u8],
) -> SequenceRecord {
let mut seq = digest_sequence(name, data);
if let SequenceRecord::Full {
ref mut metadata, ..
} = seq
{
metadata.description = description.map(String::from);
}
seq
}
pub fn parse_rgsi_line(line: &str) -> Option<SequenceMetadata> {
if line.trim().is_empty() {
return None;
}
let parts: Vec<&str> = line.split('\t').collect();
match parts.len() {
5 => Some(SequenceMetadata {
name: parts[0].to_string(),
description: None,
length: parts[1].parse().ok()?,
alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
sha512t24u: parts[3].to_string(),
md5: parts[4].to_string(),
fai: None,
}),
6 => Some(SequenceMetadata {
name: parts[0].to_string(),
description: if parts[5].is_empty() {
None
} else {
Some(parts[5].to_string())
},
length: parts[1].parse().ok()?,
alphabet: parts[2].parse().unwrap_or(AlphabetType::Unknown),
sha512t24u: parts[3].to_string(),
md5: parts[4].to_string(),
fai: None,
}),
_ => None,
}
}
pub fn parse_rgci_line(line: &str) -> Option<SequenceCollectionMetadata> {
if line.starts_with('#') {
return None;
}
let parts: Vec<&str> = line.split('\t').collect();
if parts.len() < 5 {
return None;
}
let opt_col = |i: usize| -> Option<String> {
parts.get(i).and_then(|s| {
if s.is_empty() { None } else { Some(s.to_string()) }
})
};
Some(SequenceCollectionMetadata {
digest: parts[0].to_string(),
n_sequences: parts[1].parse().ok()?,
names_digest: parts[2].to_string(),
sequences_digest: parts[3].to_string(),
lengths_digest: parts[4].to_string(),
name_length_pairs_digest: opt_col(5),
sorted_name_length_pairs_digest: opt_col(6),
sorted_sequences_digest: opt_col(7),
file_path: None,
})
}
#[cfg(test)]
mod tests {
use super::*;
fn test_metadata() -> Vec<SequenceMetadata> {
vec![
SequenceMetadata {
name: "chrX".to_string(),
description: None,
length: 8,
sha512t24u: "abc123".to_string(),
md5: "md5abc".to_string(),
alphabet: AlphabetType::Dna2bit,
fai: None,
},
SequenceMetadata {
name: "chr1".to_string(),
description: None,
length: 4,
sha512t24u: "def456".to_string(),
md5: "md5def".to_string(),
alphabet: AlphabetType::Dna2bit,
fai: None,
},
]
}
#[test]
fn test_ancillary_digest_nlp() {
let metadata = test_metadata();
let refs: Vec<&SequenceMetadata> = metadata.iter().collect();
let nlp = SeqColDigestLvl1::compute_name_length_pairs_digest(&refs);
assert!(!nlp.is_empty());
assert_eq!(nlp.len(), 32); }
#[test]
fn test_ancillary_digest_snlp() {
let metadata = test_metadata();
let refs: Vec<&SequenceMetadata> = metadata.iter().collect();
let snlp = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&refs);
assert!(!snlp.is_empty());
assert_eq!(snlp.len(), 32);
}
#[test]
fn test_ancillary_digest_sorted_sequences() {
let metadata = test_metadata();
let refs: Vec<&SequenceMetadata> = metadata.iter().collect();
let ss = SeqColDigestLvl1::compute_sorted_sequences_digest(&refs);
assert!(!ss.is_empty());
assert_eq!(ss.len(), 32);
}
#[test]
fn test_nlp_and_snlp_both_valid() {
let metadata = test_metadata();
let refs: Vec<&SequenceMetadata> = metadata.iter().collect();
let nlp = SeqColDigestLvl1::compute_name_length_pairs_digest(&refs);
let snlp = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&refs);
assert_eq!(nlp.len(), 32);
assert_eq!(snlp.len(), 32);
}
#[test]
fn test_snlp_order_invariant() {
let metadata = test_metadata();
let reversed: Vec<SequenceMetadata> = metadata.iter().rev().cloned().collect();
let refs1: Vec<&SequenceMetadata> = metadata.iter().collect();
let refs2: Vec<&SequenceMetadata> = reversed.iter().collect();
let snlp1 = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&refs1);
let snlp2 = SeqColDigestLvl1::compute_sorted_name_length_pairs_digest(&refs2);
assert_eq!(snlp1, snlp2);
}
#[test]
fn test_sorted_sequences_order_invariant() {
let metadata = test_metadata();
let reversed: Vec<SequenceMetadata> = metadata.iter().rev().cloned().collect();
let refs1: Vec<&SequenceMetadata> = metadata.iter().collect();
let refs2: Vec<&SequenceMetadata> = reversed.iter().collect();
let ss1 = SeqColDigestLvl1::compute_sorted_sequences_digest(&refs1);
let ss2 = SeqColDigestLvl1::compute_sorted_sequences_digest(&refs2);
assert_eq!(ss1, ss2);
}
#[test]
fn test_nlp_order_sensitive() {
let metadata = test_metadata();
let reversed: Vec<SequenceMetadata> = metadata.iter().rev().cloned().collect();
let refs1: Vec<&SequenceMetadata> = metadata.iter().collect();
let refs2: Vec<&SequenceMetadata> = reversed.iter().collect();
let nlp1 = SeqColDigestLvl1::compute_name_length_pairs_digest(&refs1);
let nlp2 = SeqColDigestLvl1::compute_name_length_pairs_digest(&refs2);
assert_ne!(nlp1, nlp2);
}
#[test]
fn test_to_level1() {
let metadata = test_metadata();
let records: Vec<_> = metadata
.iter()
.map(|m| SequenceRecord::Stub(m.clone()))
.collect();
let mut coll_meta = SequenceCollectionMetadata::from_sequences(&records, None);
coll_meta.compute_ancillary_digests(&records);
let lvl1 = coll_meta.to_level1();
assert_eq!(lvl1.names, coll_meta.names_digest);
assert_eq!(lvl1.lengths, coll_meta.lengths_digest);
assert_eq!(lvl1.sequences, coll_meta.sequences_digest);
assert!(lvl1.name_length_pairs.is_some());
assert!(lvl1.sorted_name_length_pairs.is_some());
assert!(lvl1.sorted_sequences.is_some());
}
#[test]
fn test_to_level2() {
let metadata = test_metadata();
let records: Vec<SequenceRecord> = metadata
.iter()
.map(|m| SequenceRecord::Stub(m.clone()))
.collect();
let collection = SequenceCollection::from_records(records);
let lvl2 = collection.to_level2();
assert_eq!(lvl2.names, vec!["chrX", "chr1"]);
assert_eq!(lvl2.lengths, vec![8, 4]);
assert_eq!(lvl2.sequences.len(), 2);
assert!(lvl2.sequences[0].starts_with("SQ."));
}
#[test]
fn test_compare_same() {
let records: Vec<SequenceRecord> = test_metadata().into_iter().map(SequenceRecord::Stub).collect();
let collection = SequenceCollection::from_records(records);
let result = collection.compare(&collection);
assert_eq!(Some(result.digests.a.as_str()), result.digests.b.as_deref());
assert_eq!(result.attributes.a_and_b.len(), 3); for attr in &result.attributes.a_and_b {
assert_eq!(result.array_elements.a_and_b_same_order[attr], Some(true));
}
}
#[test]
fn test_compare_reversed_order() {
let metadata = test_metadata();
let reversed: Vec<SequenceMetadata> = metadata.iter().rev().cloned().collect();
let coll_a = SequenceCollection::from_records(metadata.into_iter().map(SequenceRecord::Stub).collect());
let coll_b = SequenceCollection::from_records(reversed.into_iter().map(SequenceRecord::Stub).collect());
let result = coll_a.compare(&coll_b);
for attr in &result.attributes.a_and_b {
assert_eq!(result.array_elements.a_and_b_count[attr], 2);
assert_eq!(result.array_elements.a_and_b_same_order[attr], Some(false));
}
}
#[test]
fn test_compare_single_element() {
let meta = SequenceMetadata {
name: "chr1".to_string(),
description: None,
length: 4,
sha512t24u: "abc".to_string(),
md5: "md5".to_string(),
alphabet: AlphabetType::Dna2bit,
fai: None,
};
let coll = SequenceCollection::from_records(vec![SequenceRecord::Stub(meta)]);
let result = coll.compare(&coll);
for attr in &result.attributes.a_and_b {
assert_eq!(result.array_elements.a_and_b_same_order[attr], None);
}
}
}