1use std::collections::{BTreeMap, HashSet};
4
5use refget_digest::{digest_json, sha512t24u};
6use serde::{Deserialize, Serialize};
7
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)]
10pub enum Level {
11 Zero,
13 One,
15 Two,
17}
18
19impl Level {
20 pub fn from_int(n: u8) -> Option<Self> {
22 match n {
23 0 => Some(Self::Zero),
24 1 => Some(Self::One),
25 2 => Some(Self::Two),
26 _ => None,
27 }
28 }
29}
30
31#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
33pub struct SeqCol {
34 pub names: Vec<String>,
36 pub lengths: Vec<u64>,
38 pub sequences: Vec<String>,
40 #[serde(skip_serializing_if = "Option::is_none")]
42 pub sorted_name_length_pairs: Option<Vec<String>>,
43}
44
45#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
47pub struct SeqColLevel1 {
48 pub names: String,
49 pub lengths: String,
50 pub sequences: String,
51 #[serde(skip_serializing_if = "Option::is_none")]
52 pub sorted_name_length_pairs: Option<String>,
53}
54
55impl SeqCol {
56 pub fn validate(&self) -> Result<(), SeqColError> {
58 let n = self.names.len();
59 if self.lengths.len() != n {
60 return Err(SeqColError::MismatchedArrayLengths {
61 expected: n,
62 attribute: "lengths".to_string(),
63 actual: self.lengths.len(),
64 });
65 }
66 if self.sequences.len() != n {
67 return Err(SeqColError::MismatchedArrayLengths {
68 expected: n,
69 attribute: "sequences".to_string(),
70 actual: self.sequences.len(),
71 });
72 }
73 Ok(())
74 }
75
76 pub fn digest(&self) -> String {
82 let level1 = self.to_level1_inherent();
83 let obj = serde_json::json!({
84 "lengths": level1.lengths,
85 "names": level1.names,
86 "sequences": level1.sequences,
87 });
88 digest_json(&obj)
89 }
90
91 pub fn to_level1(&self) -> SeqColLevel1 {
93 let mut level1 = self.to_level1_inherent();
94 level1.sorted_name_length_pairs =
95 Some(digest_string_array(&self.sorted_name_length_pairs()));
96 level1
97 }
98
99 fn to_level1_inherent(&self) -> SeqColLevel1 {
101 SeqColLevel1 {
102 names: digest_string_array(&self.names),
103 lengths: digest_u64_array(&self.lengths),
104 sequences: digest_string_array(&self.sequences),
105 sorted_name_length_pairs: None,
106 }
107 }
108
109 pub fn sorted_name_length_pairs(&self) -> Vec<String> {
113 let mut pairs = self.name_length_pairs();
114 pairs.sort();
115 pairs
116 }
117
118 pub fn name_length_pairs(&self) -> Vec<String> {
120 self.names
121 .iter()
122 .zip(self.lengths.iter())
123 .map(|(name, length)| sha512t24u(format!("{name}:{length}").as_bytes()))
124 .collect()
125 }
126
127 pub fn to_json(&self, level: Level) -> serde_json::Value {
129 match level {
130 Level::Zero => serde_json::Value::String(self.digest()),
131 Level::One => serde_json::to_value(self.to_level1()).unwrap(),
132 Level::Two => {
133 let mut col = self.clone();
134 col.sorted_name_length_pairs = Some(self.sorted_name_length_pairs());
135 serde_json::to_value(col).unwrap()
136 }
137 }
138 }
139}
140
141pub fn compare(a: &SeqCol, b: &SeqCol) -> ComparisonResult {
143 let a_digest = a.digest();
144 let b_digest = b.digest();
145 let a_and_b: Vec<String> = INHERENT_ATTRIBUTES.iter().map(|s| (*s).to_string()).collect();
147 let a_only: Vec<String> = vec![];
148 let b_only: Vec<String> = vec![];
149
150 let mut array_elements = BTreeMap::new();
152 for attr in &a_and_b {
153 let (a_vals, b_vals) = get_attribute_strings(a, b, attr);
154 let a_set: HashSet<&str> = a_vals.iter().map(String::as_str).collect();
155 let b_set: HashSet<&str> = b_vals.iter().map(String::as_str).collect();
156
157 let total_a = a_vals.len();
158 let total_b = b_vals.len();
159 let a_and_b_count = a_set.intersection(&b_set).count();
160 let a_only_count = a_set.difference(&b_set).count();
161 let b_only_count = b_set.difference(&a_set).count();
162 let order = if a_vals == b_vals { OrderResult::Match } else { OrderResult::Differ };
163
164 array_elements.insert(
165 attr.clone(),
166 ArrayElementComparison {
167 total_a,
168 total_b,
169 a_and_b: a_and_b_count,
170 a_only: a_only_count,
171 b_only: b_only_count,
172 order,
173 },
174 );
175 }
176
177 ComparisonResult {
178 digests: DigestComparison { a: a_digest, b: b_digest },
179 attributes: AttributeComparison { a_only, b_only, a_and_b },
180 array_elements,
181 }
182}
183
184const INHERENT_ATTRIBUTES: &[&str] = &["names", "lengths", "sequences"];
186
187fn get_attribute_strings(a: &SeqCol, b: &SeqCol, attr: &str) -> (Vec<String>, Vec<String>) {
189 match attr {
190 "names" => (a.names.clone(), b.names.clone()),
191 "lengths" => (
192 a.lengths.iter().map(|v| v.to_string()).collect(),
193 b.lengths.iter().map(|v| v.to_string()).collect(),
194 ),
195 "sequences" => (a.sequences.clone(), b.sequences.clone()),
196 _ => (vec![], vec![]),
197 }
198}
199
200#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
202pub struct ComparisonResult {
203 pub digests: DigestComparison,
204 pub attributes: AttributeComparison,
205 pub array_elements: BTreeMap<String, ArrayElementComparison>,
206}
207
208#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
210pub struct DigestComparison {
211 pub a: String,
212 pub b: String,
213}
214
215#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
217pub struct AttributeComparison {
218 pub a_only: Vec<String>,
219 pub b_only: Vec<String>,
220 pub a_and_b: Vec<String>,
221}
222
223#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
225pub struct ArrayElementComparison {
226 pub total_a: usize,
227 pub total_b: usize,
228 pub a_and_b: usize,
229 pub a_only: usize,
230 pub b_only: usize,
231 pub order: OrderResult,
232}
233
234#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
236#[serde(rename_all = "lowercase")]
237pub enum OrderResult {
238 Match,
239 Differ,
240}
241
242#[derive(Debug, thiserror::Error)]
244pub enum SeqColError {
245 #[error("Array length mismatch: {attribute} has {actual} elements, expected {expected}")]
246 MismatchedArrayLengths { expected: usize, attribute: String, actual: usize },
247}
248
249fn digest_string_array(values: &[String]) -> String {
252 let json_array: Vec<serde_json::Value> =
253 values.iter().map(|v| serde_json::Value::String(v.clone())).collect();
254 let json = serde_json::Value::Array(json_array);
255 digest_json(&json)
256}
257
258fn digest_u64_array(values: &[u64]) -> String {
260 let json_array: Vec<serde_json::Value> = values.iter().map(|v| serde_json::json!(v)).collect();
261 let json = serde_json::Value::Array(json_array);
262 digest_json(&json)
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of characters every digest in these tests is expected to have.
    const DIGEST_LEN: usize = 32;

    /// Turns string literals into owned strings.
    fn strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|s| s.to_string()).collect()
    }

    /// Two-sequence collection shared by most tests.
    fn example_seqcol() -> SeqCol {
        SeqCol {
            names: strings(&["chr1", "chr2"]),
            lengths: vec![248956422, 242193529],
            sequences: strings(&[
                "SQ.IIB53T8CNeJJdUqzn1V4W1SqtRA",
                "SQ.v7noePfnNpK8ghYXEqZ9NukMXW0",
            ]),
            sorted_name_length_pairs: None,
        }
    }

    /// Collection with no sequences at all.
    fn empty_seqcol() -> SeqCol {
        SeqCol {
            names: Vec::new(),
            lengths: Vec::new(),
            sequences: Vec::new(),
            sorted_name_length_pairs: None,
        }
    }

    #[test]
    fn test_validate_ok() {
        assert!(example_seqcol().validate().is_ok());
    }

    #[test]
    fn test_validate_mismatched_lengths() {
        let mut col = example_seqcol();
        col.lengths.push(100);
        assert!(col.validate().is_err());
    }

    #[test]
    fn test_digest_deterministic() {
        let col = example_seqcol();
        let (first, second) = (col.digest(), col.digest());
        assert_eq!(first, second);
        assert_eq!(first.len(), DIGEST_LEN);
    }

    #[test]
    fn test_level1() {
        let SeqColLevel1 { names, lengths, sequences, sorted_name_length_pairs } =
            example_seqcol().to_level1();
        assert_eq!(names.len(), DIGEST_LEN);
        assert_eq!(lengths.len(), DIGEST_LEN);
        assert_eq!(sequences.len(), DIGEST_LEN);
        assert!(sorted_name_length_pairs.is_some());
    }

    #[test]
    fn test_sorted_name_length_pairs() {
        let pairs = example_seqcol().sorted_name_length_pairs();
        assert_eq!(pairs.len(), 2);
        for pair in &pairs {
            assert_eq!(pair.len(), DIGEST_LEN);
        }
        assert!(pairs[0] <= pairs[1]);
    }

    #[test]
    fn test_compare_identical() {
        let col = example_seqcol();
        let result = compare(&col, &col);
        assert_eq!(result.digests.a, result.digests.b);

        let AttributeComparison { a_only, b_only, a_and_b } = &result.attributes;
        assert!(a_only.is_empty());
        assert!(b_only.is_empty());
        assert_eq!(a_and_b.len(), 3);

        for (_, elem) in &result.array_elements {
            assert_eq!(elem.a_only, 0);
            assert_eq!(elem.b_only, 0);
            assert_eq!(elem.order, OrderResult::Match);
        }
    }

    #[test]
    fn test_compare_different() {
        let a = example_seqcol();
        let mut b = a.clone();
        b.names[0] = "chrX".to_string();

        let result = compare(&a, &b);
        assert_ne!(result.digests.a, result.digests.b);

        let names_cmp = &result.array_elements["names"];
        assert_eq!(names_cmp.a_only, 1);
        assert_eq!(names_cmp.b_only, 1);
    }

    #[test]
    fn test_to_json_levels() {
        let col = example_seqcol();
        assert!(col.to_json(Level::Zero).is_string());
        assert!(col.to_json(Level::One).is_object());

        let level_two = col.to_json(Level::Two);
        assert!(level_two.is_object());
        assert!(level_two.get("names").unwrap().is_array());
    }

    #[test]
    fn test_level_from_int_invalid_3() {
        assert!(Level::from_int(3).is_none());
    }

    #[test]
    fn test_level_from_int_invalid_255() {
        assert!(Level::from_int(255).is_none());
    }

    #[test]
    fn test_validate_all_empty_ok() {
        assert!(empty_seqcol().validate().is_ok());
    }

    #[test]
    fn test_validate_sequences_length_mismatch() {
        let mut col = example_seqcol();
        col.sequences.push("SQ.extra".to_string());
        let msg = col.validate().unwrap_err().to_string();
        assert!(msg.contains("sequences"), "error should mention 'sequences': {msg}");
    }

    #[test]
    fn test_name_length_pairs_length_and_digest_size() {
        let pairs = example_seqcol().name_length_pairs();
        assert_eq!(pairs.len(), 2);
        for pair in &pairs {
            assert_eq!(pair.len(), DIGEST_LEN, "each name-length pair digest should be 32 chars");
        }
    }

    #[test]
    fn test_compare_no_overlap() {
        let a = example_seqcol();
        let b = SeqCol {
            names: strings(&["chrX", "chrY"]),
            lengths: vec![1000, 2000],
            sequences: strings(&[
                "SQ.aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
                "SQ.bbbbbbbbbbbbbbbbbbbbbbbbbbbb",
            ]),
            sorted_name_length_pairs: None,
        };

        let result = compare(&a, &b);
        assert_ne!(result.digests.a, result.digests.b);
        for (_, elem) in &result.array_elements {
            assert_eq!(elem.a_and_b, 0, "no elements should overlap");
            assert_eq!(elem.a_only, elem.total_a);
            assert_eq!(elem.b_only, elem.total_b);
        }
    }

    #[test]
    fn test_compare_different_lengths() {
        let a = example_seqcol();
        let b = SeqCol {
            names: strings(&["chr1"]),
            lengths: vec![248956422],
            sequences: strings(&["SQ.IIB53T8CNeJJdUqzn1V4W1SqtRA"]),
            sorted_name_length_pairs: None,
        };

        let result = compare(&a, &b);
        let names_cmp = &result.array_elements["names"];
        assert_eq!(names_cmp.total_a, 2);
        assert_eq!(names_cmp.total_b, 1);
        assert_eq!(names_cmp.a_and_b, 1);
        assert_eq!(names_cmp.a_only, 1);
        assert_eq!(names_cmp.b_only, 0);
    }

    #[test]
    fn test_compare_same_elements_different_order() {
        let a = example_seqcol();
        let b = SeqCol {
            names: strings(&["chr2", "chr1"]),
            lengths: vec![242193529, 248956422],
            sequences: strings(&[
                "SQ.v7noePfnNpK8ghYXEqZ9NukMXW0",
                "SQ.IIB53T8CNeJJdUqzn1V4W1SqtRA",
            ]),
            sorted_name_length_pairs: None,
        };

        let result = compare(&a, &b);
        assert_ne!(result.digests.a, result.digests.b);
        for (_, elem) in &result.array_elements {
            assert_eq!(elem.order, OrderResult::Differ, "order should differ");
            assert_eq!(elem.a_and_b, elem.total_a, "all elements of a should be in b");
            assert_eq!(elem.a_and_b, elem.total_b, "all elements of b should be in a");
            assert_eq!(elem.a_only, 0);
            assert_eq!(elem.b_only, 0);
        }
    }

    #[test]
    fn test_to_json_level_zero_is_string() {
        let json = example_seqcol().to_json(Level::Zero);
        assert!(json.is_string(), "Level::Zero JSON should be a string");
        assert_eq!(
            json.as_str().unwrap().len(),
            DIGEST_LEN,
            "Level::Zero digest should be 32 chars"
        );
    }

    #[test]
    fn test_to_json_level_two_has_sorted_name_length_pairs() {
        let json = example_seqcol().to_json(Level::Two);
        let snlp = json.get("sorted_name_length_pairs");
        assert!(snlp.is_some(), "Level::Two should include sorted_name_length_pairs");
        assert!(snlp.unwrap().is_array());
    }

    #[test]
    fn test_empty_collection_digests_are_valid() {
        let col = empty_seqcol();
        assert_eq!(col.digest().len(), DIGEST_LEN, "digest of empty collection should be 32 chars");

        let level1 = col.to_level1();
        assert_eq!(level1.names.len(), DIGEST_LEN);
        assert_eq!(level1.lengths.len(), DIGEST_LEN);
        assert_eq!(level1.sequences.len(), DIGEST_LEN);
        // Both are digests of an empty array, so they must coincide.
        assert_eq!(level1.names, level1.sequences);
    }

    #[test]
    fn test_single_element_seqcol() {
        let col = SeqCol {
            names: strings(&["chrM"]),
            lengths: vec![16569],
            sequences: strings(&["SQ.someDigest_chrM_placeholder00"]),
            sorted_name_length_pairs: None,
        };
        assert!(col.validate().is_ok());
        assert_eq!(col.digest().len(), DIGEST_LEN);

        let level1 = col.to_level1();
        assert_eq!(level1.names.len(), DIGEST_LEN);
        assert_eq!(level1.lengths.len(), DIGEST_LEN);
        assert_eq!(level1.sequences.len(), DIGEST_LEN);
        assert!(level1.sorted_name_length_pairs.is_some());
        assert_eq!(level1.sorted_name_length_pairs.unwrap().len(), DIGEST_LEN);
    }
}