use std::collections::{BTreeMap, HashSet};
use refget_digest::{digest_json, sha512t24u};
use serde::{Deserialize, Serialize};
/// Level of detail requested when rendering a [`SeqCol`] with `SeqCol::to_json`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)]
pub enum Level {
/// The collection rendered as a single digest string.
Zero,
/// The collection rendered as an object holding one digest per attribute.
One,
/// The collection rendered with its full attribute arrays.
Two,
}
impl Level {
pub fn from_int(n: u8) -> Option<Self> {
match n {
0 => Some(Self::Zero),
1 => Some(Self::One),
2 => Some(Self::Two),
_ => None,
}
}
}
/// A sequence collection stored as parallel arrays.
///
/// `SeqCol::validate` checks that `lengths` and `sequences` have the same
/// number of elements as `names`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SeqCol {
/// Name of each sequence.
pub names: Vec<String>,
/// Length of each sequence.
pub lengths: Vec<u64>,
/// Identifier of each sequence (the tests in this file use `SQ.`-prefixed strings).
pub sequences: Vec<String>,
/// Optional sorted name/length pair digests; left out of the serialized
/// form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub sorted_name_length_pairs: Option<Vec<String>>,
}
/// Level-1 view of a [`SeqCol`]: every attribute array replaced by one digest string.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SeqColLevel1 {
/// Digest of the `names` array.
pub names: String,
/// Digest of the `lengths` array.
pub lengths: String,
/// Digest of the `sequences` array.
pub sequences: String,
/// Digest of the sorted name/length pair array; left out of the serialized
/// form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub sorted_name_length_pairs: Option<String>,
}
impl SeqCol {
pub fn validate(&self) -> Result<(), SeqColError> {
let n = self.names.len();
if self.lengths.len() != n {
return Err(SeqColError::MismatchedArrayLengths {
expected: n,
attribute: "lengths".to_string(),
actual: self.lengths.len(),
});
}
if self.sequences.len() != n {
return Err(SeqColError::MismatchedArrayLengths {
expected: n,
attribute: "sequences".to_string(),
actual: self.sequences.len(),
});
}
Ok(())
}
pub fn digest(&self) -> String {
let level1 = self.to_level1_inherent();
let obj = serde_json::json!({
"lengths": level1.lengths,
"names": level1.names,
"sequences": level1.sequences,
});
digest_json(&obj)
}
pub fn to_level1(&self) -> SeqColLevel1 {
let mut level1 = self.to_level1_inherent();
level1.sorted_name_length_pairs =
Some(digest_string_array(&self.sorted_name_length_pairs()));
level1
}
fn to_level1_inherent(&self) -> SeqColLevel1 {
SeqColLevel1 {
names: digest_string_array(&self.names),
lengths: digest_u64_array(&self.lengths),
sequences: digest_string_array(&self.sequences),
sorted_name_length_pairs: None,
}
}
pub fn sorted_name_length_pairs(&self) -> Vec<String> {
let mut pairs = self.name_length_pairs();
pairs.sort();
pairs
}
pub fn name_length_pairs(&self) -> Vec<String> {
self.names
.iter()
.zip(self.lengths.iter())
.map(|(name, length)| sha512t24u(format!("{name}:{length}").as_bytes()))
.collect()
}
pub fn to_json(&self, level: Level) -> serde_json::Value {
match level {
Level::Zero => serde_json::Value::String(self.digest()),
Level::One => serde_json::to_value(self.to_level1()).unwrap(),
Level::Two => {
let mut col = self.clone();
col.sorted_name_length_pairs = Some(self.sorted_name_length_pairs());
serde_json::to_value(col).unwrap()
}
}
}
}
pub fn compare(a: &SeqCol, b: &SeqCol) -> ComparisonResult {
let a_digest = a.digest();
let b_digest = b.digest();
let a_and_b: Vec<String> = INHERENT_ATTRIBUTES.iter().map(|s| (*s).to_string()).collect();
let a_only: Vec<String> = vec![];
let b_only: Vec<String> = vec![];
let mut array_elements = BTreeMap::new();
for attr in &a_and_b {
let (a_vals, b_vals) = get_attribute_strings(a, b, attr);
let a_set: HashSet<&str> = a_vals.iter().map(String::as_str).collect();
let b_set: HashSet<&str> = b_vals.iter().map(String::as_str).collect();
let total_a = a_vals.len();
let total_b = b_vals.len();
let a_and_b_count = a_set.intersection(&b_set).count();
let a_only_count = a_set.difference(&b_set).count();
let b_only_count = b_set.difference(&a_set).count();
let order = if a_vals == b_vals { OrderResult::Match } else { OrderResult::Differ };
array_elements.insert(
attr.clone(),
ArrayElementComparison {
total_a,
total_b,
a_and_b: a_and_b_count,
a_only: a_only_count,
b_only: b_only_count,
order,
},
);
}
ComparisonResult {
digests: DigestComparison { a: a_digest, b: b_digest },
attributes: AttributeComparison { a_only, b_only, a_and_b },
array_elements,
}
}
/// Attribute names that `compare` reports on; each one is a key understood by
/// `get_attribute_strings`.
const INHERENT_ATTRIBUTES: &[&str] = &["names", "lengths", "sequences"];
/// Returns the values of attribute `attr` from both collections as strings.
///
/// `"names"` and `"sequences"` are copied as they are, `"lengths"` is
/// rendered in decimal, and any other attribute name yields two empty lists.
fn get_attribute_strings(a: &SeqCol, b: &SeqCol, attr: &str) -> (Vec<String>, Vec<String>) {
    // One extractor applied to each side keeps the two halves in lockstep.
    let values_of = |col: &SeqCol| -> Vec<String> {
        match attr {
            "names" => col.names.clone(),
            "lengths" => col.lengths.iter().map(ToString::to_string).collect(),
            "sequences" => col.sequences.clone(),
            _ => Vec::new(),
        }
    };
    (values_of(a), values_of(b))
}
/// Outcome of `compare` for two sequence collections.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ComparisonResult {
/// Top-level digest of each collection.
pub digests: DigestComparison,
/// Which attributes are present in one or both collections.
pub attributes: AttributeComparison,
/// Per-attribute element statistics, keyed by attribute name.
pub array_elements: BTreeMap<String, ArrayElementComparison>,
}
/// Top-level digests of the two collections handed to `compare`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DigestComparison {
/// Digest of the first collection.
pub a: String,
/// Digest of the second collection.
pub b: String,
}
/// Attribute names grouped by which collection carries them.
///
/// `compare` in this file always fills `a_and_b` with the inherent attribute
/// names and leaves the other two lists empty.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AttributeComparison {
/// Attributes present only in the first collection.
pub a_only: Vec<String>,
/// Attributes present only in the second collection.
pub b_only: Vec<String>,
/// Attributes present in both collections.
pub a_and_b: Vec<String>,
}
/// Element-level statistics for one attribute, as filled in by `compare`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ArrayElementComparison {
/// Number of elements in the first collection's array.
pub total_a: usize,
/// Number of elements in the second collection's array.
pub total_b: usize,
/// Number of elements found in both arrays.
pub a_and_b: usize,
/// Number of elements found only in the first array.
pub a_only: usize,
/// Number of elements found only in the second array.
pub b_only: usize,
/// Whether the two arrays are equal element for element.
pub order: OrderResult,
}
/// Result of the ordered comparison of two attribute arrays.
///
/// Variant names are serialized in lowercase.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum OrderResult {
/// Both arrays are equal, including element order.
Match,
/// The arrays differ in content, length or order.
Differ,
}
/// Errors reported when checking a sequence collection.
#[derive(Debug, thiserror::Error)]
pub enum SeqColError {
/// The array named `attribute` has `actual` elements although `names`
/// has `expected`; produced by `SeqCol::validate`.
#[error("Array length mismatch: {attribute} has {actual} elements, expected {expected}")]
MismatchedArrayLengths { expected: usize, attribute: String, actual: usize },
}
/// Digests a list of strings as one JSON array.
///
/// Every element becomes a JSON string, in the order given, and the
/// resulting array is passed to `digest_json`.
fn digest_string_array(values: &[String]) -> String {
    // Collecting into `Value` produces a JSON array of the converted items.
    let array: serde_json::Value = values.iter().cloned().collect();
    digest_json(&array)
}
/// Digests a list of unsigned integers as one JSON array.
///
/// Every element becomes a JSON number, in the order given, and the
/// resulting array is passed to `digest_json`.
fn digest_u64_array(values: &[u64]) -> String {
    // Collecting into `Value` produces a JSON array of the converted items.
    let array: serde_json::Value = values.iter().copied().collect();
    digest_json(&array)
}
// Unit tests for validation, digest computation, JSON rendering and
// comparison of sequence collections.
#[cfg(test)]
mod tests {
use super::*;
// Two-sequence fixture (chr1, chr2) shared by most tests.
fn example_seqcol() -> SeqCol {
SeqCol {
names: vec!["chr1".to_string(), "chr2".to_string()],
lengths: vec![248956422, 242193529],
sequences: vec![
"SQ.IIB53T8CNeJJdUqzn1V4W1SqtRA".to_string(),
"SQ.v7noePfnNpK8ghYXEqZ9NukMXW0".to_string(),
],
sorted_name_length_pairs: None,
}
}
// Arrays of equal size pass validation.
#[test]
fn test_validate_ok() {
let col = example_seqcol();
assert!(col.validate().is_ok());
}
// An extra entry in `lengths` is rejected.
#[test]
fn test_validate_mismatched_lengths() {
let mut col = example_seqcol();
col.lengths.push(100);
assert!(col.validate().is_err());
}
// The same collection always yields the same 32-character digest.
#[test]
fn test_digest_deterministic() {
let col = example_seqcol();
let d1 = col.digest();
let d2 = col.digest();
assert_eq!(d1, d2);
assert_eq!(d1.len(), 32);
}
// Every level-1 attribute is a 32-character digest and the sorted pairs
// digest is populated.
#[test]
fn test_level1() {
let col = example_seqcol();
let level1 = col.to_level1();
assert_eq!(level1.names.len(), 32);
assert_eq!(level1.lengths.len(), 32);
assert_eq!(level1.sequences.len(), 32);
assert!(level1.sorted_name_length_pairs.is_some());
}
// One 32-character digest per sequence, returned in ascending order.
#[test]
fn test_sorted_name_length_pairs() {
let col = example_seqcol();
let pairs = col.sorted_name_length_pairs();
assert_eq!(pairs.len(), 2);
for p in &pairs {
assert_eq!(p.len(), 32);
}
assert!(pairs[0] <= pairs[1]);
}
// Comparing a collection with itself reports no differences.
#[test]
fn test_compare_identical() {
let col = example_seqcol();
let result = compare(&col, &col);
assert_eq!(result.digests.a, result.digests.b);
assert!(result.attributes.a_only.is_empty());
assert!(result.attributes.b_only.is_empty());
assert_eq!(result.attributes.a_and_b.len(), 3);
for elem in result.array_elements.values() {
assert_eq!(elem.a_only, 0);
assert_eq!(elem.b_only, 0);
assert_eq!(elem.order, OrderResult::Match);
}
}
// Renaming one sequence shows up as one unshared name on each side.
#[test]
fn test_compare_different() {
let a = example_seqcol();
let mut b = example_seqcol();
b.names[0] = "chrX".to_string();
let result = compare(&a, &b);
assert_ne!(result.digests.a, result.digests.b);
let names_cmp = result.array_elements.get("names").unwrap();
assert_eq!(names_cmp.a_only, 1);
assert_eq!(names_cmp.b_only, 1);
}
// Level 0 is a string; levels 1 and 2 are objects, level 2 with arrays.
#[test]
fn test_to_json_levels() {
let col = example_seqcol();
let l0 = col.to_json(Level::Zero);
assert!(l0.is_string());
let l1 = col.to_json(Level::One);
assert!(l1.is_object());
let l2 = col.to_json(Level::Two);
assert!(l2.is_object());
assert!(l2.get("names").unwrap().is_array());
}
// Numbers above 2 do not map to a level.
#[test]
fn test_level_from_int_invalid_3() {
assert!(Level::from_int(3).is_none());
}
#[test]
fn test_level_from_int_invalid_255() {
assert!(Level::from_int(255).is_none());
}
// Fixture with no sequences at all.
fn empty_seqcol() -> SeqCol {
SeqCol { names: vec![], lengths: vec![], sequences: vec![], sorted_name_length_pairs: None }
}
// Three empty arrays have matching sizes, so validation succeeds.
#[test]
fn test_validate_all_empty_ok() {
let col = empty_seqcol();
assert!(col.validate().is_ok());
}
// The error message names the offending array.
#[test]
fn test_validate_sequences_length_mismatch() {
let mut col = example_seqcol();
col.sequences.push("SQ.extra".to_string());
let err = col.validate().unwrap_err();
let msg = err.to_string();
assert!(msg.contains("sequences"), "error should mention 'sequences': {msg}");
}
// Unsorted pairs: one 32-character digest per sequence.
#[test]
fn test_name_length_pairs_length_and_digest_size() {
let col = example_seqcol();
let pairs = col.name_length_pairs();
assert_eq!(pairs.len(), 2);
for p in &pairs {
assert_eq!(p.len(), 32, "each name-length pair digest should be 32 chars");
}
}
// Collections with nothing in common: every element is exclusive to its side.
#[test]
fn test_compare_no_overlap() {
let a = example_seqcol();
let b = SeqCol {
names: vec!["chrX".to_string(), "chrY".to_string()],
lengths: vec![1000, 2000],
sequences: vec![
"SQ.aaaaaaaaaaaaaaaaaaaaaaaaaaaa".to_string(),
"SQ.bbbbbbbbbbbbbbbbbbbbbbbbbbbb".to_string(),
],
sorted_name_length_pairs: None,
};
let result = compare(&a, &b);
assert_ne!(result.digests.a, result.digests.b);
for elem in result.array_elements.values() {
assert_eq!(elem.a_and_b, 0, "no elements should overlap");
assert_eq!(elem.a_only, elem.total_a);
assert_eq!(elem.b_only, elem.total_b);
}
}
// `b` is a one-sequence subset of `a`.
#[test]
fn test_compare_different_lengths() {
let a = example_seqcol();
let b = SeqCol {
names: vec!["chr1".to_string()],
lengths: vec![248956422],
sequences: vec!["SQ.IIB53T8CNeJJdUqzn1V4W1SqtRA".to_string()],
sorted_name_length_pairs: None,
};
let result = compare(&a, &b);
let names_cmp = result.array_elements.get("names").unwrap();
assert_eq!(names_cmp.total_a, 2);
assert_eq!(names_cmp.total_b, 1);
assert_eq!(names_cmp.a_and_b, 1);
assert_eq!(names_cmp.a_only, 1);
assert_eq!(names_cmp.b_only, 0);
}
// Same content in reversed order: full overlap, but the order differs.
#[test]
fn test_compare_same_elements_different_order() {
let a = example_seqcol();
let b = SeqCol {
names: vec!["chr2".to_string(), "chr1".to_string()],
lengths: vec![242193529, 248956422],
sequences: vec![
"SQ.v7noePfnNpK8ghYXEqZ9NukMXW0".to_string(),
"SQ.IIB53T8CNeJJdUqzn1V4W1SqtRA".to_string(),
],
sorted_name_length_pairs: None,
};
let result = compare(&a, &b);
assert_ne!(result.digests.a, result.digests.b);
for elem in result.array_elements.values() {
assert_eq!(elem.order, OrderResult::Differ, "order should differ");
assert_eq!(elem.a_and_b, elem.total_a, "all elements of a should be in b");
assert_eq!(elem.a_and_b, elem.total_b, "all elements of b should be in a");
assert_eq!(elem.a_only, 0);
assert_eq!(elem.b_only, 0);
}
}
// Level 0 JSON is the 32-character digest string.
#[test]
fn test_to_json_level_zero_is_string() {
let col = example_seqcol();
let json = col.to_json(Level::Zero);
assert!(json.is_string(), "Level::Zero JSON should be a string");
assert_eq!(json.as_str().unwrap().len(), 32, "Level::Zero digest should be 32 chars");
}
// Level 2 JSON carries the sorted pairs even though the fixture has none stored.
#[test]
fn test_to_json_level_two_has_sorted_name_length_pairs() {
let col = example_seqcol();
let json = col.to_json(Level::Two);
let snlp = json.get("sorted_name_length_pairs");
assert!(snlp.is_some(), "Level::Two should include sorted_name_length_pairs");
assert!(snlp.unwrap().is_array());
}
// Empty arrays still digest to 32 characters; `names` and `sequences` are
// both empty string arrays and therefore share a digest.
#[test]
fn test_empty_collection_digests_are_valid() {
let col = empty_seqcol();
let d = col.digest();
assert_eq!(d.len(), 32, "digest of empty collection should be 32 chars");
let level1 = col.to_level1();
assert_eq!(level1.names.len(), 32);
assert_eq!(level1.lengths.len(), 32);
assert_eq!(level1.sequences.len(), 32);
assert_eq!(level1.names, level1.sequences);
}
// A one-sequence collection validates and digests like any other.
#[test]
fn test_single_element_seqcol() {
let col = SeqCol {
names: vec!["chrM".to_string()],
lengths: vec![16569],
sequences: vec!["SQ.someDigest_chrM_placeholder00".to_string()],
sorted_name_length_pairs: None,
};
assert!(col.validate().is_ok());
let d = col.digest();
assert_eq!(d.len(), 32);
let level1 = col.to_level1();
assert_eq!(level1.names.len(), 32);
assert_eq!(level1.lengths.len(), 32);
assert_eq!(level1.sequences.len(), 32);
assert!(level1.sorted_name_length_pairs.is_some());
assert_eq!(level1.sorted_name_length_pairs.unwrap().len(), 32);
}
}