1use crate::Archive;
5use anyhow::{Context, Result};
6use std::collections::HashMap;
7
8#[derive(Debug, Clone, Copy, PartialEq)]
34pub struct SegmentDesc {
35 pub group_id: u32,
36 pub in_group_id: u32, pub is_rev_comp: bool,
38 pub raw_length: u32,
39}
40
41impl SegmentDesc {
42 pub fn new(group_id: u32, in_group_id: u32, is_rev_comp: bool, raw_length: u32) -> Self {
43 SegmentDesc {
44 group_id,
45 in_group_id,
46 is_rev_comp,
47 raw_length,
48 }
49 }
50
51 pub fn empty() -> Self {
52 SegmentDesc {
53 group_id: u32::MAX,
54 in_group_id: u32::MAX,
55 is_rev_comp: false,
56 raw_length: 0,
57 }
58 }
59}
60
61#[derive(Debug, Clone)]
63pub struct ContigDesc {
64 pub name: String,
65 pub segments: Vec<SegmentDesc>,
66}
67
68impl ContigDesc {
69 pub fn new(name: String) -> Self {
70 ContigDesc {
71 name,
72 segments: Vec::new(),
73 }
74 }
75}
76
77#[derive(Debug, Clone)]
79pub struct SampleDesc {
80 pub name: String,
81 pub contigs: Vec<ContigDesc>,
82}
83
84impl SampleDesc {
85 pub fn new(name: String) -> Self {
86 SampleDesc {
87 name,
88 contigs: Vec::new(),
89 }
90 }
91}
92
93pub struct CollectionVarInt;
102
103impl CollectionVarInt {
104 const THR_1: u32 = 1 << 7;
105 const THR_2: u32 = Self::THR_1 + (1 << 14);
106 const THR_3: u32 = Self::THR_2 + (1 << 21);
107 const THR_4: u32 = Self::THR_3 + (1 << 28);
108
109 const PREF_1: u8 = 0;
110 const PREF_2: u8 = 0b10000000u8;
111 const PREF_3: u8 = 0b11000000u8;
112 const PREF_4: u8 = 0b11100000u8;
113 const PREF_5: u8 = 0b11110000u8;
114
115 const MASK_1: u8 = 0b10000000u8;
116 const MASK_2: u8 = 0b11000000u8;
117 const MASK_3: u8 = 0b11100000u8;
118 const MASK_4: u8 = 0b11110000u8;
119
120 pub fn encode(data: &mut Vec<u8>, num: u32) {
121 if num < Self::THR_1 {
122 data.push(Self::PREF_1 + num as u8);
123 } else if num < Self::THR_2 {
124 let num = num - Self::THR_1;
125 data.push(Self::PREF_2 + (num >> 8) as u8);
126 data.push((num & 0xff) as u8);
127 } else if num < Self::THR_3 {
128 let num = num - Self::THR_2;
129 data.push(Self::PREF_3 + (num >> 16) as u8);
130 data.push(((num >> 8) & 0xff) as u8);
131 data.push((num & 0xff) as u8);
132 } else if num < Self::THR_4 {
133 let num = num - Self::THR_3;
134 data.push(Self::PREF_4 + (num >> 24) as u8);
135 data.push(((num >> 16) & 0xff) as u8);
136 data.push(((num >> 8) & 0xff) as u8);
137 data.push((num & 0xff) as u8);
138 } else {
139 let num = num - Self::THR_4;
140 data.push(Self::PREF_5);
141 data.push(((num >> 24) & 0xff) as u8);
142 data.push(((num >> 16) & 0xff) as u8);
143 data.push(((num >> 8) & 0xff) as u8);
144 data.push((num & 0xff) as u8);
145 }
146 }
147
148 pub fn decode(ptr: &mut &[u8]) -> Result<u32> {
149 if ptr.is_empty() {
150 anyhow::bail!("Unexpected end of data while decoding varint");
151 }
152
153 let first = ptr[0];
154
155 if (first & Self::MASK_1) == Self::PREF_1 {
156 let num = (first - Self::PREF_1) as u32;
157 *ptr = &ptr[1..];
158 Ok(num)
159 } else if (first & Self::MASK_2) == Self::PREF_2 {
160 if ptr.len() < 2 {
161 anyhow::bail!("Unexpected end of data while decoding 2-byte varint");
162 }
163 let num =
164 ((ptr[0] as u32) << 8) + ptr[1] as u32 + Self::THR_1 - ((Self::PREF_2 as u32) << 8);
165 *ptr = &ptr[2..];
166 Ok(num)
167 } else if (first & Self::MASK_3) == Self::PREF_3 {
168 if ptr.len() < 3 {
169 anyhow::bail!("Unexpected end of data while decoding 3-byte varint");
170 }
171 let num =
172 ((ptr[0] as u32) << 16) + ((ptr[1] as u32) << 8) + ptr[2] as u32 + Self::THR_2
173 - ((Self::PREF_3 as u32) << 16);
174 *ptr = &ptr[3..];
175 Ok(num)
176 } else if (first & Self::MASK_4) == Self::PREF_4 {
177 if ptr.len() < 4 {
178 anyhow::bail!("Unexpected end of data while decoding 4-byte varint");
179 }
180 let num = ((ptr[0] as u32) << 24)
181 + ((ptr[1] as u32) << 16)
182 + ((ptr[2] as u32) << 8)
183 + ptr[3] as u32
184 + Self::THR_3
185 - ((Self::PREF_4 as u32) << 24);
186 *ptr = &ptr[4..];
187 Ok(num)
188 } else {
189 if ptr.len() < 5 {
191 anyhow::bail!("Unexpected end of data while decoding 5-byte varint");
192 }
193 let mut num = ptr[1] as u32;
194 num <<= 8;
195 num += ptr[2] as u32;
196 num <<= 8;
197 num += ptr[3] as u32;
198 num <<= 8;
199 num += ptr[4] as u32;
200 num += Self::THR_4;
201 *ptr = &ptr[5..];
202 Ok(num)
203 }
204 }
205
206 pub fn encode_string(data: &mut Vec<u8>, s: &str) {
207 data.extend_from_slice(s.as_bytes());
208 data.push(0);
209 }
210
211 pub fn decode_string(ptr: &mut &[u8]) -> Result<String> {
212 let end = ptr
213 .iter()
214 .position(|&b| b == 0)
215 .context("Null terminator not found in string")?;
216
217 let s = String::from_utf8(ptr[..end].to_vec()).context("Invalid UTF-8 in string")?;
218
219 *ptr = &ptr[end + 1..];
220 Ok(s)
221 }
222}
223
224pub fn zigzag_encode_i64(x: i64) -> u64 {
226 if x >= 0 {
227 (2 * x) as u64
228 } else {
229 (2 * (-x) - 1) as u64
230 }
231}
232
233pub fn zigzag_decode_i64(x: u64) -> i64 {
234 if x & 1 != 0 {
235 -(x.div_ceil(2) as i64)
236 } else {
237 (x / 2) as i64
238 }
239}
240
241pub fn zigzag_encode(x_curr: u64, x_prev: u64) -> u64 {
243 if x_curr < x_prev {
244 2 * (x_prev - x_curr) - 1
245 } else if x_curr < 2 * x_prev {
246 2 * (x_curr - x_prev)
247 } else {
248 x_curr
249 }
250}
251
252pub fn zigzag_decode(x_val: u64, x_prev: u64) -> u64 {
253 if x_val >= 2 * x_prev {
254 x_val
255 } else if x_val & 1 != 0 {
256 (2 * x_prev - x_val) / 2
257 } else {
258 (x_val + 2 * x_prev) / 2
259 }
260}
261
262pub struct CollectionV3 {
264 sample_desc: Vec<SampleDesc>,
265 sample_ids: HashMap<String, usize>,
266
267 collection_samples_id: Option<usize>,
269 collection_contigs_id: Option<usize>,
270 collection_details_id: Option<usize>,
271
272 batch_size: usize,
274 segment_size: u32,
275 kmer_length: u32,
276
277 prev_sample_name: String,
279 #[allow(dead_code)]
280 placing_sample_name: String,
281 #[allow(dead_code)]
282 placing_sample_id: usize,
283 no_samples_in_last_batch: usize,
284
285 in_group_ids: Vec<i32>,
287}
288
289impl Default for CollectionV3 {
290 fn default() -> Self {
291 Self::new()
292 }
293}
294
295impl CollectionV3 {
296 pub fn new() -> Self {
297 CollectionV3 {
298 sample_desc: Vec::new(),
299 sample_ids: HashMap::new(),
300 collection_samples_id: None,
301 collection_contigs_id: None,
302 collection_details_id: None,
303 batch_size: 1 << 20, segment_size: 0,
305 kmer_length: 0,
306 prev_sample_name: String::new(),
307 placing_sample_name: String::new(),
308 placing_sample_id: 0,
309 no_samples_in_last_batch: 0,
310 in_group_ids: Vec::new(),
311 }
312 }
313
314 pub fn set_config(&mut self, segment_size: u32, kmer_length: u32, batch_size: Option<usize>) {
315 self.segment_size = segment_size;
316 self.kmer_length = kmer_length;
317 if let Some(bs) = batch_size {
318 self.batch_size = bs;
319 }
320 }
321
322 pub fn prepare_for_compression(&mut self, archive: &mut Archive) -> Result<()> {
323 self.collection_samples_id = Some(archive.register_stream("collection-samples"));
324 self.collection_contigs_id = Some(archive.register_stream("collection-contigs"));
325 self.collection_details_id = Some(archive.register_stream("collection-details"));
326 Ok(())
327 }
328
329 pub fn prepare_for_decompression(&mut self, archive: &Archive) -> Result<()> {
330 self.collection_samples_id = archive.get_stream_id("collection-samples");
331 self.collection_contigs_id = archive.get_stream_id("collection-contigs");
332 self.collection_details_id = archive.get_stream_id("collection-details");
333
334 if self.collection_samples_id.is_none() {
335 anyhow::bail!("collection-samples stream not found in archive");
336 }
337 if self.collection_contigs_id.is_none() {
338 anyhow::bail!("collection-contigs stream not found in archive");
339 }
340 if self.collection_details_id.is_none() {
341 anyhow::bail!("collection-details stream not found in archive");
342 }
343
344 Ok(())
345 }
346
347 pub fn register_sample_contig(&mut self, sample_name: &str, contig_name: &str) -> Result<bool> {
349 let mut stored_sample_name = sample_name.to_string();
350
351 if sample_name.is_empty() {
352 stored_sample_name = Self::extract_contig_name(contig_name);
353 }
354
355 let sample_id = if let Some(&id) = self.sample_ids.get(&stored_sample_name) {
357 id
359 } else {
360 let id = self.sample_ids.len();
362 self.sample_ids.insert(stored_sample_name.clone(), id);
363 self.sample_desc
364 .push(SampleDesc::new(stored_sample_name.clone()));
365 id
366 };
367
368 self.prev_sample_name = stored_sample_name;
369
370 let sample = &mut self.sample_desc[sample_id];
372 if !sample.contigs.iter().any(|c| c.name == contig_name) {
373 sample
374 .contigs
375 .push(ContigDesc::new(contig_name.to_string()));
376 Ok(true)
377 } else {
378 Ok(false) }
380 }
381
382 #[allow(clippy::too_many_arguments)]
384 pub fn add_segment_placed(
385 &mut self,
386 sample_name: &str,
387 contig_name: &str,
388 place: usize,
389 group_id: u32,
390 in_group_id: u32,
391 is_rev_comp: bool,
392 raw_length: u32,
393 ) -> Result<()> {
394 let mut stored_sample_name = sample_name.to_string();
395
396 if sample_name.is_empty() {
397 stored_sample_name = Self::extract_contig_name(contig_name);
398 }
399
400 let sample_id = *self
404 .sample_ids
405 .get(&stored_sample_name)
406 .context(format!("Sample not found: {stored_sample_name}"))?;
407
408 if std::env::var("RAGC_DEBUG_REGISTER").is_ok() {
409 eprintln!("REGISTER: sample='{stored_sample_name}' (id={sample_id}), contig='{contig_name}', place={place}");
410 }
411
412 let sample = &mut self.sample_desc[sample_id];
413 for contig in &mut sample.contigs {
414 if contig.name == contig_name {
415 if place >= contig.segments.len() {
416 contig.segments.resize(place + 1, SegmentDesc::empty());
417 }
418 contig.segments[place] =
419 SegmentDesc::new(group_id, in_group_id, is_rev_comp, raw_length);
420 return Ok(());
421 }
422 }
423
424 anyhow::bail!("Contig {contig_name} not found in sample {stored_sample_name}");
425 }
426
427 pub fn get_samples_list(&self, sorted: bool) -> Vec<String> {
429 let mut samples: Vec<String> = self.sample_desc.iter().map(|s| s.name.clone()).collect();
430 if sorted {
431 samples.sort();
432 }
433 samples
434 }
435
436 pub fn get_no_samples(&self) -> usize {
438 self.sample_desc.len()
439 }
440
441 pub fn get_no_contigs(&self, sample_name: &str) -> Option<usize> {
443 self.sample_ids
444 .get(sample_name)
445 .map(|&id| self.sample_desc[id].contigs.len())
446 }
447
448 pub fn get_contig_list(&self, sample_name: &str) -> Option<Vec<String>> {
450 self.sample_ids.get(sample_name).map(|&id| {
451 self.sample_desc[id]
452 .contigs
453 .iter()
454 .map(|c| c.name.clone())
455 .collect()
456 })
457 }
458
459 pub fn get_sample_desc(&self, sample_name: &str) -> Option<Vec<(String, Vec<SegmentDesc>)>> {
461 self.sample_ids.get(sample_name).map(|&id| {
462 self.sample_desc[id]
463 .contigs
464 .iter()
465 .map(|c| (c.name.clone(), c.segments.clone()))
466 .collect()
467 })
468 }
469
470 pub fn get_contig_desc(
472 &self,
473 sample_name: &str,
474 contig_name: &str,
475 ) -> Option<Vec<SegmentDesc>> {
476 self.sample_ids.get(sample_name).and_then(|&id| {
477 self.sample_desc[id]
478 .contigs
479 .iter()
480 .find(|c| c.name == contig_name)
481 .map(|c| c.segments.clone())
482 })
483 }
484
485 pub fn get_params(&self) -> (u32, u32) {
487 (self.segment_size, self.kmer_length)
488 }
489
490 fn extract_contig_name(s: &str) -> String {
492 s.split_whitespace().next().unwrap_or(s).to_string()
493 }
494
495 fn serialize_sample_names(&self) -> Vec<u8> {
499 let mut data = Vec::new();
500 CollectionVarInt::encode(&mut data, self.sample_desc.len() as u32);
501
502 for sample in &self.sample_desc {
503 CollectionVarInt::encode_string(&mut data, &sample.name);
504 }
505
506 data
507 }
508
509 fn deserialize_sample_names(&mut self, data: &[u8]) -> Result<()> {
511 let mut ptr = data;
512
513 let no_samples = CollectionVarInt::decode(&mut ptr)?;
514
515 self.sample_desc.clear();
516 self.sample_ids.clear();
517
518 for i in 0..no_samples {
519 let name = CollectionVarInt::decode_string(&mut ptr)?;
520 self.sample_ids.insert(name.clone(), i as usize);
521 self.sample_desc.push(SampleDesc::new(name));
522 }
523
524 Ok(())
525 }
526
527 fn split_string(s: &str) -> Vec<String> {
529 s.split(' ').map(|s| s.to_string()).collect()
530 }
531
532 fn encode_split(prev_split: &[String], curr_split: &[String]) -> Vec<u8> {
534 let mut enc = Vec::new();
535
536 for i in 0..curr_split.len() {
537 if prev_split[i] == curr_split[i] {
538 enc.push((-127i8) as u8); } else if prev_split[i].len() != curr_split[i].len() {
540 enc.extend_from_slice(curr_split[i].as_bytes());
541 } else {
542 let mut cnt: i8 = 0;
544 let p_bytes = prev_split[i].as_bytes();
545 let c_bytes = curr_split[i].as_bytes();
546
547 for j in 0..c_bytes.len() {
548 if p_bytes[j] == c_bytes[j] {
549 if cnt == 100 {
550 enc.push((-cnt) as u8); cnt = 1;
552 } else {
553 cnt += 1;
554 }
555 } else {
556 if cnt > 0 {
557 enc.push((-cnt) as u8); cnt = 0;
559 }
560 enc.push(c_bytes[j]);
561 }
562 }
563
564 if cnt > 0 {
565 enc.push((-cnt) as u8); }
567 }
568
569 enc.push(b' ');
570 }
571
572 if enc.last() == Some(&b' ') {
574 enc.pop();
575 }
576
577 enc
578 }
579
580 #[allow(clippy::ptr_arg)]
582 fn decode_split(prev_split: &mut Vec<String>, curr_split_bytes: &[u8]) -> String {
583 let parts: Vec<&[u8]> = curr_split_bytes.split(|&b| b == b' ').collect();
585 let mut dec_parts = Vec::new();
586
587 for i in 0..parts.len() {
588 if parts[i].len() == 1 && (parts[i][0] as i8) == -127 {
589 dec_parts.push(prev_split[i].clone());
591 } else {
592 let mut cmp = Vec::new();
593 let p_bytes = prev_split[i].as_bytes();
594 let mut p_idx = 0;
595
596 for &byte in parts[i] {
597 let c = byte as i8;
598 if c >= 0 {
599 cmp.push(byte);
600 p_idx += 1;
601 } else {
602 let count = -c as usize;
604 cmp.extend_from_slice(&p_bytes[p_idx..p_idx + count]);
605 p_idx += count;
606 }
607 }
608
609 let component = String::from_utf8(cmp).expect("Invalid UTF-8 in component");
610 prev_split[i] = component.clone();
611 dec_parts.push(component);
612 }
613 }
614
615 dec_parts.join(" ")
616 }
617
618 fn serialize_contig_names(&self, id_from: usize, id_to: usize) -> Vec<u8> {
620 let mut data = Vec::new();
621
622 CollectionVarInt::encode(&mut data, (id_to - id_from) as u32);
623
624 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
625 eprintln!("SERIALIZE_CONTIG_NAMES: samples {id_from}..{id_to}");
626 }
627
628 for (sample_idx, sample) in self.sample_desc[id_from..id_to].iter().enumerate() {
629 CollectionVarInt::encode(&mut data, sample.contigs.len() as u32);
630
631 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
632 eprintln!(
633 " Sample {} ({}): {} contigs",
634 id_from + sample_idx,
635 sample.name,
636 sample.contigs.len()
637 );
638 }
639
640 let mut prev_split = Vec::new();
641
642 for (contig_idx, contig) in sample.contigs.iter().enumerate() {
643 let curr_split = Self::split_string(&contig.name);
644 let before_len = data.len();
645
646 if curr_split.len() != prev_split.len() {
647 CollectionVarInt::encode_string(&mut data, &contig.name);
648
649 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
650 eprintln!(
651 " Contig {}: '{}' (FULL) -> {} bytes",
652 contig_idx,
653 contig.name,
654 data.len() - before_len
655 );
656 }
657 } else {
658 let enc_bytes = Self::encode_split(&prev_split, &curr_split);
659 data.extend_from_slice(&enc_bytes);
661 data.push(0);
662
663 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
664 eprintln!(" Contig {}: '{}' (DELTA prev='{}') -> enc_bytes={:?}, {} bytes total",
665 contig_idx, contig.name, prev_split.join(" "), enc_bytes, data.len() - before_len);
666 }
667 }
668
669 prev_split = curr_split;
670 }
671 }
672
673 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
674 eprintln!(" Total serialized size: {} bytes", data.len());
675 }
676
677 data
678 }
679
680 fn decode_bytes_string(ptr: &mut &[u8]) -> Result<Vec<u8>> {
682 let end = ptr
683 .iter()
684 .position(|&b| b == 0)
685 .context("Null terminator not found in bytes string")?;
686
687 let bytes = ptr[..end].to_vec();
688 *ptr = &ptr[end + 1..];
689 Ok(bytes)
690 }
691
692 fn deserialize_contig_names(&mut self, data: &[u8], i_sample: usize) -> Result<()> {
694 let mut ptr = data;
695
696 let no_samples_in_curr_batch = CollectionVarInt::decode(&mut ptr)? as usize;
697
698 for i in 0..no_samples_in_curr_batch {
699 let no_contigs = CollectionVarInt::decode(&mut ptr)? as usize;
700
701 let curr_sample = &mut self.sample_desc[i_sample + i];
702 curr_sample.contigs.clear();
703 curr_sample.contigs.reserve(no_contigs);
704
705 let mut prev_split = Vec::new();
706
707 for _ in 0..no_contigs {
708 let enc_bytes = Self::decode_bytes_string(&mut ptr)?;
709
710 let name = if let Ok(enc_str) = std::str::from_utf8(&enc_bytes) {
712 let curr_split = Self::split_string(enc_str);
713
714 if prev_split.is_empty() || curr_split.len() != prev_split.len() {
715 prev_split = curr_split;
717 enc_str.to_string()
718 } else {
719 let decoded = Self::decode_split(&mut prev_split, &enc_bytes);
721 prev_split = Self::split_string(&decoded);
722 decoded
723 }
724 } else {
725 if prev_split.is_empty() {
727 anyhow::bail!("Cannot decode non-UTF8 bytes without previous split");
728 }
729 let decoded = Self::decode_split(&mut prev_split, &enc_bytes);
730 prev_split = Self::split_string(&decoded);
731 decoded
732 };
733
734 curr_sample.contigs.push(ContigDesc::new(name));
735 }
736 }
737
738 self.no_samples_in_last_batch = no_samples_in_curr_batch;
739
740 Ok(())
741 }
742
743 fn clear_in_group_ids(&mut self) {
745 self.in_group_ids.clear();
746 }
747
748 fn get_in_group_id(&self, pos: usize) -> i32 {
750 self.in_group_ids.get(pos).copied().unwrap_or(-1)
751 }
752
753 fn set_in_group_id(&mut self, pos: usize, val: i32) {
755 if pos >= self.in_group_ids.len() {
756 self.in_group_ids
757 .resize((pos as f64 * 1.2) as usize + 1, -1);
758 }
759 self.in_group_ids[pos] = val;
760 }
761
762 fn serialize_contig_details(&mut self, id_from: usize, id_to: usize) -> [Vec<u8>; 5] {
764 let mut v_data: [Vec<u8>; 5] = Default::default();
765
766 CollectionVarInt::encode(&mut v_data[0], (id_to - id_from) as u32);
767
768 self.clear_in_group_ids();
769
770 let pred_raw_length = self.segment_size + self.kmer_length;
771
772 let mut encoded_segments = Vec::new();
775
776 for sample_idx in id_from..id_to {
777 let mut sample_encoded = Vec::new();
778
779 for contig_idx in 0..self.sample_desc[sample_idx].contigs.len() {
780 let mut contig_encoded = Vec::new();
781
782 for seg_idx in 0..self.sample_desc[sample_idx].contigs[contig_idx]
783 .segments
784 .len()
785 {
786 let seg = &self.sample_desc[sample_idx].contigs[contig_idx].segments[seg_idx];
787 let prev_in_group_id = self.get_in_group_id(seg.group_id as usize);
788
789 let e_group_id = seg.group_id;
790 let e_in_group_id = if prev_in_group_id == -1 {
791 seg.in_group_id
792 } else if seg.in_group_id == 0 {
793 0
794 } else if seg.in_group_id as i32 == prev_in_group_id + 1 {
795 1
796 } else {
797 zigzag_encode(seg.in_group_id as u64, (prev_in_group_id + 1) as u64) as u32
798 + 1
799 };
800
801 let e_raw_length =
802 zigzag_encode(seg.raw_length as u64, pred_raw_length as u64) as u32;
803
804 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
806 eprintln!(
807 "ENCODE: group={}, in_group_id={}, prev={}, e_in_group_id={}",
808 seg.group_id, seg.in_group_id, prev_in_group_id, e_in_group_id
809 );
810 }
811
812 contig_encoded.push((e_group_id, e_in_group_id, e_raw_length, seg.is_rev_comp));
813
814 if seg.in_group_id as i32 > prev_in_group_id && seg.in_group_id > 0 {
818 self.set_in_group_id(seg.group_id as usize, seg.in_group_id as i32);
819 }
820 }
821
822 sample_encoded.push(contig_encoded);
823 }
824
825 encoded_segments.push(sample_encoded);
826 }
827
828 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
830 eprintln!(
831 "SERIALIZE: Writing {} samples to collection",
832 encoded_segments.len()
833 );
834 }
835
836 for (sample_idx, sample_encoded) in encoded_segments.iter().enumerate() {
837 let before_len = v_data[0].len();
838 CollectionVarInt::encode(&mut v_data[0], sample_encoded.len() as u32);
839 let after_len = v_data[0].len();
840
841 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
842 eprintln!(" Sample {}: {} contigs", sample_idx, sample_encoded.len());
843 eprintln!(
844 " Wrote bytes [{}..{}]: {:?}",
845 before_len,
846 after_len,
847 &v_data[0][before_len..after_len]
848 );
849 }
850
851 for (contig_idx, contig_encoded) in sample_encoded.iter().enumerate() {
852 let before_len = v_data[0].len();
853 CollectionVarInt::encode(&mut v_data[0], contig_encoded.len() as u32);
854 let after_len = v_data[0].len();
855
856 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
857 eprintln!(
858 " Contig {}: {} segments",
859 contig_idx,
860 contig_encoded.len()
861 );
862 eprintln!(
863 " Wrote bytes [{}..{}]: {:?}",
864 before_len,
865 after_len,
866 &v_data[0][before_len..after_len]
867 );
868 }
869
870 for &(e_group_id, e_in_group_id, e_raw_length, is_rev_comp) in contig_encoded {
871 CollectionVarInt::encode(&mut v_data[1], e_group_id);
872 CollectionVarInt::encode(&mut v_data[2], e_in_group_id);
873 CollectionVarInt::encode(&mut v_data[3], e_raw_length);
874 CollectionVarInt::encode(&mut v_data[4], is_rev_comp as u32);
875 }
876 }
877
878 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
879 let total_segments: usize = sample_encoded.iter().map(|c| c.len()).sum();
880 eprintln!(" Sample {sample_idx} total: {total_segments} segments");
881 }
882 }
883
884 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
885 eprintln!(
886 "Stream sizes: [0]={}, [1]={}, [2]={}, [3]={}, [4]={}",
887 v_data[0].len(),
888 v_data[1].len(),
889 v_data[2].len(),
890 v_data[3].len(),
891 v_data[4].len()
892 );
893 }
894
895 v_data
896 }
897
898 fn deserialize_contig_details(&mut self, v_data: &[Vec<u8>; 5], i_sample: usize) -> Result<()> {
900 let mut ptr0 = v_data[0].as_slice();
901
902 let no_samples_in_curr_batch = CollectionVarInt::decode(&mut ptr0)? as usize;
903
904 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
905 eprintln!("DESERIALIZE: Reading {no_samples_in_curr_batch} samples from collection");
906 }
907
908 let mut structure = Vec::new();
910 for sample_idx in 0..no_samples_in_curr_batch {
911 let no_contigs = CollectionVarInt::decode(&mut ptr0)? as usize;
912 let mut contig_seg_counts = Vec::new();
913
914 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
915 eprintln!(" Sample {sample_idx}: {no_contigs} contigs");
916 }
917
918 for contig_idx in 0..no_contigs {
919 let no_segments = CollectionVarInt::decode(&mut ptr0)? as usize;
920 contig_seg_counts.push(no_segments);
921
922 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
923 eprintln!(" Contig {contig_idx}: {no_segments} segments");
924 }
925 }
926
927 structure.push(contig_seg_counts);
928 }
929
930 let mut v_det: [Vec<u32>; 5] = Default::default();
932 let mut no_items = 0;
933
934 for counts in &structure {
935 for &count in counts {
936 no_items += count;
937 }
938 }
939
940 for i in 1..5 {
941 v_det[i].reserve(no_items);
942 let mut ptr = v_data[i].as_slice();
943
944 for _ in 0..no_items {
945 v_det[i].push(CollectionVarInt::decode(&mut ptr)?);
946 }
947 }
948
949 self.clear_in_group_ids();
951
952 let pred_raw_length = self.segment_size + self.kmer_length;
953 let mut item_idx = 0;
954
955 let mut decoded_segments = Vec::new();
958
959 for contig_seg_counts in &structure {
960 let mut sample_segs = Vec::new();
961
962 for &no_segments in contig_seg_counts {
963 let mut contig_segs = Vec::new();
964
965 for _ in 0..no_segments {
966 let c_group_id = v_det[1][item_idx];
967 let prev_in_group_id = self.get_in_group_id(c_group_id as usize);
968
969 let e_in_group_id = v_det[2][item_idx];
970 let c_in_group_id = if prev_in_group_id == -1 {
971 e_in_group_id
972 } else if e_in_group_id == 0 {
973 0
974 } else if e_in_group_id == 1 {
975 (prev_in_group_id + 1) as u32
976 } else {
977 zigzag_decode(e_in_group_id as u64 - 1, (prev_in_group_id + 1) as u64)
978 as u32
979 };
980
981 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
983 eprintln!("DECODE: item={item_idx}, group={c_group_id}, e_in_group_id={e_in_group_id}, prev={prev_in_group_id}, c_in_group_id={c_in_group_id}");
984 }
985
986 let c_raw_length =
987 zigzag_decode(v_det[3][item_idx] as u64, pred_raw_length as u64) as u32;
988 let c_is_rev_comp = v_det[4][item_idx] != 0;
989
990 contig_segs.push(SegmentDesc::new(
991 c_group_id,
992 c_in_group_id,
993 c_is_rev_comp,
994 c_raw_length,
995 ));
996
997 if c_in_group_id as i32 > prev_in_group_id && c_in_group_id > 0 {
1001 self.set_in_group_id(c_group_id as usize, c_in_group_id as i32);
1002 }
1003
1004 item_idx += 1;
1005 }
1006
1007 sample_segs.push(contig_segs);
1008 }
1009
1010 decoded_segments.push(sample_segs);
1011 }
1012
1013 for (i, sample_segs) in decoded_segments.into_iter().enumerate() {
1015 let curr_sample = &mut self.sample_desc[i_sample + i];
1016
1017 for (j, contig_segs) in sample_segs.into_iter().enumerate() {
1018 curr_sample.contigs[j].segments = contig_segs;
1019 }
1020 }
1021
1022 Ok(())
1023 }
1024
1025 pub fn store_batch_sample_names(&mut self, archive: &mut Archive) -> Result<()> {
1027 let v_tmp = self.serialize_sample_names();
1028
1029 let v_data = zstd::encode_all(&v_tmp[..], 19).context("Failed to compress sample names")?;
1030
1031 let stream_id = self
1032 .collection_samples_id
1033 .context("collection-samples stream not registered")?;
1034
1035 archive.add_part(stream_id, &v_data, v_tmp.len() as u64)?;
1036
1037 Ok(())
1038 }
1039
1040 pub fn load_batch_sample_names(&mut self, archive: &mut Archive) -> Result<()> {
1042 let stream_id = self
1043 .collection_samples_id
1044 .context("collection-samples stream not found")?;
1045
1046 let (v_tmp, raw_size) = archive
1047 .get_part(stream_id)?
1048 .context("No sample names batch found")?;
1049
1050 let v_data = zstd::decode_all(&v_tmp[..]).context("Failed to decompress sample names")?;
1051
1052 if v_data.len() != raw_size as usize {
1053 anyhow::bail!(
1054 "Decompressed size mismatch: expected {}, got {}",
1055 raw_size,
1056 v_data.len()
1057 );
1058 }
1059
1060 self.deserialize_sample_names(&v_data)?;
1061
1062 Ok(())
1063 }
1064
1065 #[allow(clippy::needless_range_loop)]
1067 pub fn load_contig_batch(&mut self, archive: &mut Archive, id_batch: usize) -> Result<()> {
1068 let contig_stream_id = self
1070 .collection_contigs_id
1071 .context("collection-contigs stream not found")?;
1072
1073 let (v_tmp_names, raw_size_names) = archive.get_part_by_id(contig_stream_id, id_batch)?;
1074 let v_data_names =
1075 zstd::decode_all(&v_tmp_names[..]).context("Failed to decompress contig names")?;
1076
1077 if v_data_names.len() != raw_size_names as usize {
1078 anyhow::bail!(
1079 "Decompressed size mismatch for contig names: expected {}, got {}",
1080 raw_size_names,
1081 v_data_names.len()
1082 );
1083 }
1084
1085 self.deserialize_contig_names(&v_data_names, id_batch * self.batch_size)?;
1086
1087 let details_stream_id = self
1089 .collection_details_id
1090 .context("collection-details stream not found")?;
1091
1092 let (v_stream, _) = archive.get_part_by_id(details_stream_id, id_batch)?;
1093
1094 let mut ptr = v_stream.as_slice();
1096 let mut sizes: [(u32, u32); 5] = Default::default();
1097
1098 for i in 0..5 {
1099 sizes[i].0 = CollectionVarInt::decode(&mut ptr)?; sizes[i].1 = CollectionVarInt::decode(&mut ptr)?; }
1102
1103 let mut v_data_details_compressed: [Vec<u8>; 5] = Default::default();
1104 for i in 0..5 {
1105 v_data_details_compressed[i] = ptr[..sizes[i].1 as usize].to_vec();
1106 ptr = &ptr[sizes[i].1 as usize..];
1107 }
1108
1109 let mut v_data_details: [Vec<u8>; 5] = Default::default();
1111 for i in 0..5 {
1112 v_data_details[i] = zstd::decode_all(&v_data_details_compressed[i][..])
1113 .context(format!("Failed to decompress contig details stream {i}"))?;
1114
1115 if v_data_details[i].len() != sizes[i].0 as usize {
1116 anyhow::bail!(
1117 "Decompressed size mismatch for details stream {}: expected {}, got {}",
1118 i,
1119 sizes[i].0,
1120 v_data_details[i].len()
1121 );
1122 }
1123 }
1124
1125 self.deserialize_contig_details(&v_data_details, id_batch * self.batch_size)?;
1126
1127 Ok(())
1128 }
1129
1130 #[allow(clippy::needless_range_loop)]
1132 pub fn store_contig_batch(
1133 &mut self,
1134 archive: &mut Archive,
1135 id_from: usize,
1136 id_to: usize,
1137 ) -> Result<()> {
1138 let v_tmp_names = self.serialize_contig_names(id_from, id_to);
1140 let v_data_names =
1141 zstd::encode_all(&v_tmp_names[..], 18).context("Failed to compress contig names")?;
1142
1143 let contig_stream_id = self
1144 .collection_contigs_id
1145 .context("collection-contigs stream not registered")?;
1146
1147 archive.add_part(contig_stream_id, &v_data_names, v_tmp_names.len() as u64)?;
1148
1149 let v_data_details_raw = self.serialize_contig_details(id_from, id_to);
1151
1152 let mut v_data_details_compressed: [Vec<u8>; 5] = Default::default();
1153 for i in 0..5 {
1154 v_data_details_compressed[i] = zstd::encode_all(&v_data_details_raw[i][..], 19)
1155 .context(format!("Failed to compress contig details stream {i}"))?;
1156 }
1157
1158 let mut v_stream = Vec::new();
1160 for i in 0..5 {
1161 CollectionVarInt::encode(&mut v_stream, v_data_details_raw[i].len() as u32);
1162 CollectionVarInt::encode(&mut v_stream, v_data_details_compressed[i].len() as u32);
1163 }
1164 for i in 0..5 {
1165 v_stream.extend_from_slice(&v_data_details_compressed[i]);
1166 }
1167
1168 let details_stream_id = self
1169 .collection_details_id
1170 .context("collection-details stream not registered")?;
1171
1172 archive.add_part(details_stream_id, &v_stream, 0)?;
1173
1174 for sample in &mut self.sample_desc[id_from..id_to] {
1176 sample.contigs.clear();
1177 sample.contigs.shrink_to_fit();
1178 }
1179
1180 Ok(())
1181 }
1182}
1183
1184#[cfg(test)]
1185mod tests {
1186 use super::*;
1187
1188 #[test]
1189 fn test_collection_varint_1byte() {
1190 let mut data = Vec::new();
1191 CollectionVarInt::encode(&mut data, 0);
1192 CollectionVarInt::encode(&mut data, 42);
1193 CollectionVarInt::encode(&mut data, 127);
1194
1195 let mut ptr = data.as_slice();
1196 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 0);
1197 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 42);
1198 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 127);
1199 }
1200
1201 #[test]
1202 fn test_collection_varint_2byte() {
1203 let mut data = Vec::new();
1204 CollectionVarInt::encode(&mut data, 128);
1205 CollectionVarInt::encode(&mut data, 255);
1206 CollectionVarInt::encode(&mut data, 16511);
1207
1208 let mut ptr = data.as_slice();
1209 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 128);
1210 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 255);
1211 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 16511);
1212 }
1213
1214 #[test]
1215 fn test_collection_varint_3byte() {
1216 let mut data = Vec::new();
1217 CollectionVarInt::encode(&mut data, 16512);
1218 CollectionVarInt::encode(&mut data, 65536);
1219 CollectionVarInt::encode(&mut data, 2113663);
1220
1221 let mut ptr = data.as_slice();
1222 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 16512);
1223 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 65536);
1224 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 2113663);
1225 }
1226
1227 #[test]
1228 fn test_collection_varint_string() {
1229 let mut data = Vec::new();
1230 CollectionVarInt::encode_string(&mut data, "Hello");
1231 CollectionVarInt::encode_string(&mut data, "World");
1232
1233 let mut ptr = data.as_slice();
1234 assert_eq!(CollectionVarInt::decode_string(&mut ptr).unwrap(), "Hello");
1235 assert_eq!(CollectionVarInt::decode_string(&mut ptr).unwrap(), "World");
1236 }
1237
1238 #[test]
1239 fn test_zigzag_i64() {
1240 assert_eq!(zigzag_encode_i64(0), 0);
1241 assert_eq!(zigzag_encode_i64(1), 2);
1242 assert_eq!(zigzag_encode_i64(-1), 1);
1243 assert_eq!(zigzag_encode_i64(2), 4);
1244 assert_eq!(zigzag_encode_i64(-2), 3);
1245
1246 assert_eq!(zigzag_decode_i64(0), 0);
1247 assert_eq!(zigzag_decode_i64(2), 1);
1248 assert_eq!(zigzag_decode_i64(1), -1);
1249 assert_eq!(zigzag_decode_i64(4), 2);
1250 assert_eq!(zigzag_decode_i64(3), -2);
1251 }
1252
1253 #[test]
1254 fn test_zigzag_predictive() {
1255 assert_eq!(zigzag_encode(100, 100), 0); assert_eq!(zigzag_encode(101, 100), 2); assert_eq!(zigzag_encode(99, 100), 1); assert_eq!(zigzag_encode(200, 100), 200); assert_eq!(zigzag_decode(0, 100), 100);
1262 assert_eq!(zigzag_decode(2, 100), 101);
1263 assert_eq!(zigzag_decode(1, 100), 99);
1264 assert_eq!(zigzag_decode(200, 100), 200);
1265 }
1266
1267 #[test]
1268 fn test_segment_desc() {
1269 let seg = SegmentDesc::new(42, 7, true, 1000);
1270 assert_eq!(seg.group_id, 42);
1271 assert_eq!(seg.in_group_id, 7);
1272 assert!(seg.is_rev_comp);
1273 assert_eq!(seg.raw_length, 1000);
1274 }
1275
1276 #[test]
1277 fn test_collection_register() {
1278 let mut coll = CollectionV3::new();
1279
1280 assert!(coll.register_sample_contig("sample1", "contig1").unwrap());
1281 assert!(coll.register_sample_contig("sample1", "contig2").unwrap());
1282 assert!(coll.register_sample_contig("sample2", "contig1").unwrap());
1283
1284 assert_eq!(coll.get_no_samples(), 2);
1285
1286 let samples = coll.get_samples_list(false);
1287 assert_eq!(samples, vec!["sample1", "sample2"]);
1288 }
1289
1290 #[test]
1294 fn test_in_group_id_delta_encoding_roundtrip() {
1295 let mut coll = CollectionV3::new();
1296 coll.set_config(60000, 31, None);
1297
1298 coll.register_sample_contig("sample1", "contig1").unwrap();
1300 coll.register_sample_contig("sample1", "contig2").unwrap();
1301 coll.register_sample_contig("sample2", "contig1").unwrap();
1302
1303 let test_segments = vec![
1306 ("sample1", "contig1", 0, 93, 0, false, 61000), ("sample1", "contig1", 1, 93, 1, false, 61000), ("sample1", "contig2", 0, 93, 2, false, 61000), ("sample1", "contig2", 1, 93, 3, false, 61000), ("sample2", "contig1", 0, 93, 4, false, 61000), ("sample2", "contig1", 1, 93, 5, false, 61000), ("sample2", "contig1", 2, 93, 6, false, 61000), ];
1314
1315 for (sample, contig, place, group_id, in_group_id, is_rev_comp, raw_len) in &test_segments {
1316 coll.add_segment_placed(
1317 sample,
1318 contig,
1319 *place,
1320 *group_id,
1321 *in_group_id,
1322 *is_rev_comp,
1323 *raw_len,
1324 )
1325 .unwrap();
1326 }
1327
1328 let serialized = coll.serialize_contig_details(0, 2);
1330
1331 let mut coll2 = CollectionV3::new();
1333 coll2.set_config(60000, 31, None);
1334
1335 coll2.register_sample_contig("sample1", "contig1").unwrap();
1337 coll2.register_sample_contig("sample1", "contig2").unwrap();
1338 coll2.register_sample_contig("sample2", "contig1").unwrap();
1339
1340 coll2.deserialize_contig_details(&serialized, 0).unwrap();
1342
1343 let sample1_contig1 = coll2.get_contig_desc("sample1", "contig1").unwrap();
1345 assert_eq!(
1346 sample1_contig1.len(),
1347 2,
1348 "sample1/contig1 should have 2 segments"
1349 );
1350 assert_eq!(
1351 sample1_contig1[0].in_group_id, 0,
1352 "sample1/contig1 segment 0 should have in_group_id=0"
1353 );
1354 assert_eq!(
1355 sample1_contig1[1].in_group_id, 1,
1356 "sample1/contig1 segment 1 should have in_group_id=1"
1357 );
1358
1359 let sample1_contig2 = coll2.get_contig_desc("sample1", "contig2").unwrap();
1360 assert_eq!(
1361 sample1_contig2.len(),
1362 2,
1363 "sample1/contig2 should have 2 segments"
1364 );
1365 assert_eq!(
1366 sample1_contig2[0].in_group_id, 2,
1367 "sample1/contig2 segment 0 should have in_group_id=2"
1368 );
1369 assert_eq!(
1370 sample1_contig2[1].in_group_id, 3,
1371 "sample1/contig2 segment 1 should have in_group_id=3"
1372 );
1373
1374 let sample2_contig1 = coll2.get_contig_desc("sample2", "contig1").unwrap();
1375 assert_eq!(
1376 sample2_contig1.len(),
1377 3,
1378 "sample2/contig1 should have 3 segments"
1379 );
1380 assert_eq!(
1381 sample2_contig1[0].in_group_id, 4,
1382 "sample2/contig1 segment 0 should have in_group_id=4"
1383 );
1384 assert_eq!(
1385 sample2_contig1[1].in_group_id, 5,
1386 "sample2/contig1 segment 1 should have in_group_id=5"
1387 );
1388 assert_eq!(
1389 sample2_contig1[2].in_group_id, 6,
1390 "sample2/contig1 segment 2 should have in_group_id=6"
1391 );
1392 }
1393}