1use crate::Archive;
5use anyhow::{Context, Result};
6use std::collections::HashMap;
7
/// Placement record for one segment of a contig.
///
/// The four fields are exactly what the contig-details streams store per
/// segment (one stream per field).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SegmentDesc {
    /// Group the segment belongs to; also indexes the per-group
    /// `in_group_id` table used while (de)serializing a batch.
    pub group_id: u32,
    /// Identifier of the segment inside its group.
    pub in_group_id: u32,
    /// Orientation flag (presumably "stored reverse-complemented" — confirm
    /// against the segment writer).
    pub is_rev_comp: bool,
    /// Raw length; serialized as a delta against `segment_size + kmer_length`.
    pub raw_length: u32,
}

impl SegmentDesc {
    /// Builds a descriptor from its four components.
    pub fn new(group_id: u32, in_group_id: u32, is_rev_comp: bool, raw_length: u32) -> Self {
        Self {
            group_id,
            in_group_id,
            is_rev_comp,
            raw_length,
        }
    }

    /// Placeholder used to pad a contig's segment list: both ids are
    /// `u32::MAX`, the orientation flag is `false` and the length is zero.
    pub fn empty() -> Self {
        Self::new(u32::MAX, u32::MAX, false, 0)
    }
}
60
61#[derive(Debug, Clone)]
63pub struct ContigDesc {
64 pub name: String,
65 pub segments: Vec<SegmentDesc>,
66}
67
68impl ContigDesc {
69 pub fn new(name: String) -> Self {
70 ContigDesc {
71 name,
72 segments: Vec::new(),
73 }
74 }
75}
76
77#[derive(Debug, Clone)]
79pub struct SampleDesc {
80 pub name: String,
81 pub contigs: Vec<ContigDesc>,
82}
83
84impl SampleDesc {
85 pub fn new(name: String) -> Self {
86 SampleDesc {
87 name,
88 contigs: Vec::new(),
89 }
90 }
91}
92
93pub struct CollectionVarInt;
102
103impl CollectionVarInt {
104 const THR_1: u32 = 1 << 7;
105 const THR_2: u32 = Self::THR_1 + (1 << 14);
106 const THR_3: u32 = Self::THR_2 + (1 << 21);
107 const THR_4: u32 = Self::THR_3 + (1 << 28);
108
109 const PREF_1: u8 = 0;
110 const PREF_2: u8 = 0b10000000u8;
111 const PREF_3: u8 = 0b11000000u8;
112 const PREF_4: u8 = 0b11100000u8;
113 const PREF_5: u8 = 0b11110000u8;
114
115 const MASK_1: u8 = 0b10000000u8;
116 const MASK_2: u8 = 0b11000000u8;
117 const MASK_3: u8 = 0b11100000u8;
118 const MASK_4: u8 = 0b11110000u8;
119
120 pub fn encode(data: &mut Vec<u8>, num: u32) {
121 if num < Self::THR_1 {
122 data.push(Self::PREF_1 + num as u8);
123 } else if num < Self::THR_2 {
124 let num = num - Self::THR_1;
125 data.push(Self::PREF_2 + (num >> 8) as u8);
126 data.push((num & 0xff) as u8);
127 } else if num < Self::THR_3 {
128 let num = num - Self::THR_2;
129 data.push(Self::PREF_3 + (num >> 16) as u8);
130 data.push(((num >> 8) & 0xff) as u8);
131 data.push((num & 0xff) as u8);
132 } else if num < Self::THR_4 {
133 let num = num - Self::THR_3;
134 data.push(Self::PREF_4 + (num >> 24) as u8);
135 data.push(((num >> 16) & 0xff) as u8);
136 data.push(((num >> 8) & 0xff) as u8);
137 data.push((num & 0xff) as u8);
138 } else {
139 let num = num - Self::THR_4;
140 data.push(Self::PREF_5);
141 data.push(((num >> 24) & 0xff) as u8);
142 data.push(((num >> 16) & 0xff) as u8);
143 data.push(((num >> 8) & 0xff) as u8);
144 data.push((num & 0xff) as u8);
145 }
146 }
147
148 pub fn decode(ptr: &mut &[u8]) -> Result<u32> {
149 if ptr.is_empty() {
150 anyhow::bail!("Unexpected end of data while decoding varint");
151 }
152
153 let first = ptr[0];
154
155 if (first & Self::MASK_1) == Self::PREF_1 {
156 let num = (first - Self::PREF_1) as u32;
157 *ptr = &ptr[1..];
158 Ok(num)
159 } else if (first & Self::MASK_2) == Self::PREF_2 {
160 if ptr.len() < 2 {
161 anyhow::bail!("Unexpected end of data while decoding 2-byte varint");
162 }
163 let num =
164 ((ptr[0] as u32) << 8) + ptr[1] as u32 + Self::THR_1 - ((Self::PREF_2 as u32) << 8);
165 *ptr = &ptr[2..];
166 Ok(num)
167 } else if (first & Self::MASK_3) == Self::PREF_3 {
168 if ptr.len() < 3 {
169 anyhow::bail!("Unexpected end of data while decoding 3-byte varint");
170 }
171 let num =
172 ((ptr[0] as u32) << 16) + ((ptr[1] as u32) << 8) + ptr[2] as u32 + Self::THR_2
173 - ((Self::PREF_3 as u32) << 16);
174 *ptr = &ptr[3..];
175 Ok(num)
176 } else if (first & Self::MASK_4) == Self::PREF_4 {
177 if ptr.len() < 4 {
178 anyhow::bail!("Unexpected end of data while decoding 4-byte varint");
179 }
180 let num = ((ptr[0] as u32) << 24)
181 + ((ptr[1] as u32) << 16)
182 + ((ptr[2] as u32) << 8)
183 + ptr[3] as u32
184 + Self::THR_3
185 - ((Self::PREF_4 as u32) << 24);
186 *ptr = &ptr[4..];
187 Ok(num)
188 } else {
189 if ptr.len() < 5 {
191 anyhow::bail!("Unexpected end of data while decoding 5-byte varint");
192 }
193 let mut num = ptr[1] as u32;
194 num <<= 8;
195 num += ptr[2] as u32;
196 num <<= 8;
197 num += ptr[3] as u32;
198 num <<= 8;
199 num += ptr[4] as u32;
200 num += Self::THR_4;
201 *ptr = &ptr[5..];
202 Ok(num)
203 }
204 }
205
206 pub fn encode_string(data: &mut Vec<u8>, s: &str) {
207 data.extend_from_slice(s.as_bytes());
208 data.push(0);
209 }
210
211 pub fn decode_string(ptr: &mut &[u8]) -> Result<String> {
212 let end = ptr
213 .iter()
214 .position(|&b| b == 0)
215 .context("Null terminator not found in string")?;
216
217 let s = String::from_utf8(ptr[..end].to_vec()).context("Invalid UTF-8 in string")?;
218
219 *ptr = &ptr[end + 1..];
220 Ok(s)
221 }
222}
223
/// Maps a signed integer to an unsigned one so that values of small magnitude
/// get small codes: `0, -1, 1, -2, 2, ...` become `0, 1, 2, 3, 4, ...`.
///
/// Uses the shift/xor formulation, which is defined for the whole `i64`
/// range (including `i64::MIN` and `i64::MAX`) and never overflows.
pub fn zigzag_encode_i64(x: i64) -> u64 {
    // `x >> 63` is all ones for negative `x` and zero otherwise, so the xor
    // turns `2x` into `-2x - 1` exactly when `x` is negative.
    ((x << 1) ^ (x >> 63)) as u64
}
232
/// Inverse of [`zigzag_encode_i64`]: odd codes map to negative values, even
/// codes to non-negative ones.
///
/// Defined for every `u64`, including `u64::MAX` (which decodes to
/// `i64::MIN`); no intermediate step can overflow.
pub fn zigzag_decode_i64(x: u64) -> i64 {
    // `-(x & 1)` is all ones for odd codes, flipping the halved value into
    // its negative counterpart.
    ((x >> 1) as i64) ^ -((x & 1) as i64)
}
240
/// Encodes `x_curr` relative to the prediction `x_prev`.
///
/// * below the prediction: odd code `2 * (x_prev - x_curr) - 1`
/// * from the prediction up to (excluding) twice the prediction: even code
///   `2 * (x_curr - x_prev)`
/// * otherwise the value is stored unchanged.
pub fn zigzag_encode(x_curr: u64, x_prev: u64) -> u64 {
    if x_curr >= x_prev {
        let twice_prev = 2 * x_prev;
        if x_curr >= twice_prev {
            return x_curr;
        }
        return 2 * (x_curr - x_prev);
    }

    2 * (x_prev - x_curr) - 1
}
251
/// Inverse of [`zigzag_encode`] for the same prediction `x_prev`.
///
/// Codes of at least `2 * x_prev` are literal values; smaller odd codes lie
/// below the prediction and smaller even codes at or above it.
pub fn zigzag_decode(x_val: u64, x_prev: u64) -> u64 {
    let twice_prev = 2 * x_prev;

    if x_val >= twice_prev {
        return x_val;
    }

    match x_val & 1 {
        0 => (x_val + twice_prev) / 2,
        _ => (twice_prev - x_val) / 2,
    }
}
261
/// In-memory sample → contig → segment catalogue, plus the bookkeeping needed
/// to store it into and load it from the three "collection-*" archive streams.
pub struct CollectionV3 {
    /// Descriptors of all known samples, in registration / load order.
    sample_desc: Vec<SampleDesc>,
    /// Maps a sample name to its index in `sample_desc`.
    sample_ids: HashMap<String, usize>,

    /// Archive stream id of "collection-samples"; `None` until one of the
    /// `prepare_for_*` methods has run.
    collection_samples_id: Option<usize>,
    /// Archive stream id of "collection-contigs"; `None` until prepared.
    collection_contigs_id: Option<usize>,
    /// Archive stream id of "collection-details"; `None` until prepared.
    collection_details_id: Option<usize>,

    /// Defaults to `1 << 20`; can be overridden through `set_config`.
    /// NOTE(review): not read anywhere in the code visible here.
    batch_size: usize,
    /// Together with `kmer_length` forms the predicted raw segment length
    /// (`segment_size + kmer_length`) used to delta-code `raw_length`.
    segment_size: u32,
    /// See `segment_size`.
    kmer_length: u32,

    /// Name of the sample most recently passed through
    /// `register_sample_contig`.
    prev_sample_name: String,
    /// Currently unused.
    #[allow(dead_code)]
    placing_sample_name: String,
    /// Currently unused.
    #[allow(dead_code)]
    placing_sample_id: usize,
    /// Number of samples in the batch most recently decoded by
    /// `deserialize_contig_names`.
    no_samples_in_last_batch: usize,
    /// Running total of samples loaded by `load_contig_batch`; doubles as the
    /// index of the first sample of the next batch.
    samples_loaded: usize,

    /// Scratch table indexed by group id: the largest positive `in_group_id`
    /// seen so far for that group while (de)serializing a batch. Missing
    /// entries and `-1` both mean "none seen yet".
    in_group_ids: Vec<i32>,
}
290
291impl Default for CollectionV3 {
292 fn default() -> Self {
293 Self::new()
294 }
295}
296
297impl CollectionV3 {
298 pub fn new() -> Self {
299 CollectionV3 {
300 sample_desc: Vec::new(),
301 sample_ids: HashMap::new(),
302 collection_samples_id: None,
303 collection_contigs_id: None,
304 collection_details_id: None,
305 batch_size: 1 << 20, segment_size: 0,
307 kmer_length: 0,
308 prev_sample_name: String::new(),
309 placing_sample_name: String::new(),
310 placing_sample_id: 0,
311 no_samples_in_last_batch: 0,
312 samples_loaded: 0,
313 in_group_ids: Vec::new(),
314 }
315 }
316
317 pub fn set_config(&mut self, segment_size: u32, kmer_length: u32, batch_size: Option<usize>) {
318 self.segment_size = segment_size;
319 self.kmer_length = kmer_length;
320 if let Some(bs) = batch_size {
321 self.batch_size = bs;
322 }
323 }
324
325 pub fn prepare_for_compression(&mut self, archive: &mut Archive) -> Result<()> {
326 self.collection_samples_id = Some(archive.register_stream("collection-samples"));
327 self.collection_contigs_id = Some(archive.register_stream("collection-contigs"));
328 self.collection_details_id = Some(archive.register_stream("collection-details"));
329 Ok(())
330 }
331
332 pub fn prepare_for_decompression(&mut self, archive: &Archive) -> Result<()> {
333 self.collection_samples_id = archive.get_stream_id("collection-samples");
334 self.collection_contigs_id = archive.get_stream_id("collection-contigs");
335 self.collection_details_id = archive.get_stream_id("collection-details");
336
337 if self.collection_samples_id.is_none() {
338 anyhow::bail!("collection-samples stream not found in archive");
339 }
340 if self.collection_contigs_id.is_none() {
341 anyhow::bail!("collection-contigs stream not found in archive");
342 }
343 if self.collection_details_id.is_none() {
344 anyhow::bail!("collection-details stream not found in archive");
345 }
346
347 Ok(())
348 }
349
350 pub fn register_sample_contig(&mut self, sample_name: &str, contig_name: &str) -> Result<bool> {
352 let mut stored_sample_name = sample_name.to_string();
353
354 if sample_name.is_empty() {
355 stored_sample_name = Self::extract_contig_name(contig_name);
356 }
357
358 let sample_id = if let Some(&id) = self.sample_ids.get(&stored_sample_name) {
360 id
362 } else {
363 let id = self.sample_ids.len();
365 self.sample_ids.insert(stored_sample_name.clone(), id);
366 self.sample_desc
367 .push(SampleDesc::new(stored_sample_name.clone()));
368 id
369 };
370
371 self.prev_sample_name = stored_sample_name;
372
373 let sample = &mut self.sample_desc[sample_id];
375 if !sample.contigs.iter().any(|c| c.name == contig_name) {
376 sample
377 .contigs
378 .push(ContigDesc::new(contig_name.to_string()));
379 Ok(true)
380 } else {
381 Ok(false) }
383 }
384
385 #[allow(clippy::too_many_arguments)]
387 pub fn add_segment_placed(
388 &mut self,
389 sample_name: &str,
390 contig_name: &str,
391 place: usize,
392 group_id: u32,
393 in_group_id: u32,
394 is_rev_comp: bool,
395 raw_length: u32,
396 ) -> Result<()> {
397 let mut stored_sample_name = sample_name.to_string();
398
399 if sample_name.is_empty() {
400 stored_sample_name = Self::extract_contig_name(contig_name);
401 }
402
403 let sample_id = *self
407 .sample_ids
408 .get(&stored_sample_name)
409 .context(format!("Sample not found: {stored_sample_name}"))?;
410
411 if std::env::var("RAGC_DEBUG_REGISTER").is_ok() {
412 eprintln!("REGISTER: sample='{stored_sample_name}' (id={sample_id}), contig='{contig_name}', place={place}");
413 }
414
415 let sample = &mut self.sample_desc[sample_id];
416 for contig in &mut sample.contigs {
417 if contig.name == contig_name {
418 if place >= contig.segments.len() {
419 contig.segments.resize(place + 1, SegmentDesc::empty());
420 }
421 contig.segments[place] =
422 SegmentDesc::new(group_id, in_group_id, is_rev_comp, raw_length);
423 return Ok(());
424 }
425 }
426
427 anyhow::bail!("Contig {contig_name} not found in sample {stored_sample_name}");
428 }
429
430 pub fn get_samples_list(&self, sorted: bool) -> Vec<String> {
432 let mut samples: Vec<String> = self.sample_desc.iter().map(|s| s.name.clone()).collect();
433 if sorted {
434 samples.sort();
435 }
436 samples
437 }
438
439 pub fn get_no_samples(&self) -> usize {
441 self.sample_desc.len()
442 }
443
444 pub fn get_no_contigs(&self, sample_name: &str) -> Option<usize> {
446 self.sample_ids
447 .get(sample_name)
448 .map(|&id| self.sample_desc[id].contigs.len())
449 }
450
451 pub fn get_no_contig_batches(&self, archive: &Archive) -> Result<usize> {
453 let contig_stream_id = self
454 .collection_contigs_id
455 .context("collection-contigs stream not found")?;
456 Ok(archive.get_num_parts(contig_stream_id))
457 }
458
459 pub fn get_contig_list(&self, sample_name: &str) -> Option<Vec<String>> {
461 self.sample_ids.get(sample_name).map(|&id| {
462 self.sample_desc[id]
463 .contigs
464 .iter()
465 .map(|c| c.name.clone())
466 .collect()
467 })
468 }
469
470 pub fn get_sample_desc(&self, sample_name: &str) -> Option<Vec<(String, Vec<SegmentDesc>)>> {
472 self.sample_ids.get(sample_name).map(|&id| {
473 self.sample_desc[id]
474 .contigs
475 .iter()
476 .map(|c| (c.name.clone(), c.segments.clone()))
477 .collect()
478 })
479 }
480
481 pub fn get_contig_desc(
483 &self,
484 sample_name: &str,
485 contig_name: &str,
486 ) -> Option<Vec<SegmentDesc>> {
487 self.sample_ids.get(sample_name).and_then(|&id| {
488 self.sample_desc[id]
489 .contigs
490 .iter()
491 .find(|c| c.name == contig_name)
492 .map(|c| c.segments.clone())
493 })
494 }
495
496 pub fn get_params(&self) -> (u32, u32) {
498 (self.segment_size, self.kmer_length)
499 }
500
501 fn extract_contig_name(s: &str) -> String {
503 s.split_whitespace().next().unwrap_or(s).to_string()
504 }
505
506 fn serialize_sample_names(&self) -> Vec<u8> {
510 let mut data = Vec::new();
511 CollectionVarInt::encode(&mut data, self.sample_desc.len() as u32);
512
513 for sample in &self.sample_desc {
514 CollectionVarInt::encode_string(&mut data, &sample.name);
515 }
516
517 data
518 }
519
520 fn deserialize_sample_names(&mut self, data: &[u8]) -> Result<()> {
522 let mut ptr = data;
523
524 let no_samples = CollectionVarInt::decode(&mut ptr)?;
525
526 self.sample_desc.clear();
527 self.sample_ids.clear();
528
529 for i in 0..no_samples {
530 let name = CollectionVarInt::decode_string(&mut ptr)?;
531 self.sample_ids.insert(name.clone(), i as usize);
532 self.sample_desc.push(SampleDesc::new(name));
533 }
534
535 Ok(())
536 }
537
538 fn split_string(s: &str) -> Vec<String> {
540 s.split(' ').map(|s| s.to_string()).collect()
541 }
542
543 fn encode_split(prev_split: &[String], curr_split: &[String]) -> Vec<u8> {
545 let mut enc = Vec::new();
546
547 for i in 0..curr_split.len() {
548 if prev_split[i] == curr_split[i] {
549 enc.push((-127i8) as u8); } else if prev_split[i].len() != curr_split[i].len() {
551 enc.extend_from_slice(curr_split[i].as_bytes());
552 } else {
553 let mut cnt: i8 = 0;
555 let p_bytes = prev_split[i].as_bytes();
556 let c_bytes = curr_split[i].as_bytes();
557
558 for j in 0..c_bytes.len() {
559 if p_bytes[j] == c_bytes[j] {
560 if cnt == 100 {
561 enc.push((-cnt) as u8); cnt = 1;
563 } else {
564 cnt += 1;
565 }
566 } else {
567 if cnt > 0 {
568 enc.push((-cnt) as u8); cnt = 0;
570 }
571 enc.push(c_bytes[j]);
572 }
573 }
574
575 if cnt > 0 {
576 enc.push((-cnt) as u8); }
578 }
579
580 enc.push(b' ');
581 }
582
583 if enc.last() == Some(&b' ') {
585 enc.pop();
586 }
587
588 enc
589 }
590
591 #[allow(clippy::ptr_arg)]
595 fn decode_split_bytes(prev_split: &Vec<Vec<u8>>, curr_split: &mut Vec<Vec<u8>>) -> String {
596 let mut dec = Vec::new();
597
598 for i in 0..curr_split.len() {
599 if curr_split[i].len() == 1 && (curr_split[i][0] as i8) == -127 {
600 dec.extend_from_slice(&prev_split[i]);
602 curr_split[i] = prev_split[i].clone();
603 } else {
604 let mut cmp = Vec::new();
605 let p_bytes = &prev_split[i];
606 let mut p_idx = 0;
607
608 for &byte in &curr_split[i] {
609 let c = byte as i8;
610 if c >= 0 {
611 cmp.push(byte);
613 p_idx += 1;
614 } else {
615 let count = -c as usize;
617 cmp.extend_from_slice(&p_bytes[p_idx..p_idx + count]);
618 p_idx += count;
619 }
620 }
621
622 dec.extend_from_slice(&cmp);
623 curr_split[i] = cmp;
624 }
625
626 dec.push(b' ');
627 }
628
629 dec.pop(); String::from_utf8(dec).expect("Invalid UTF-8 in decoded contig name")
631 }
632
633 fn serialize_contig_names(&self, id_from: usize, id_to: usize) -> Vec<u8> {
635 let mut data = Vec::new();
636
637 CollectionVarInt::encode(&mut data, (id_to - id_from) as u32);
638
639 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
640 eprintln!("SERIALIZE_CONTIG_NAMES: samples {id_from}..{id_to}");
641 }
642
643 for (sample_idx, sample) in self.sample_desc[id_from..id_to].iter().enumerate() {
644 CollectionVarInt::encode(&mut data, sample.contigs.len() as u32);
645
646 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
647 eprintln!(
648 " Sample {} ({}): {} contigs",
649 id_from + sample_idx,
650 sample.name,
651 sample.contigs.len()
652 );
653 }
654
655 let mut prev_split = Vec::new();
656
657 for (contig_idx, contig) in sample.contigs.iter().enumerate() {
658 let curr_split = Self::split_string(&contig.name);
659 let before_len = data.len();
660
661 if curr_split.len() != prev_split.len() {
662 CollectionVarInt::encode_string(&mut data, &contig.name);
663
664 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
665 eprintln!(
666 " Contig {}: '{}' (FULL) -> {} bytes",
667 contig_idx,
668 contig.name,
669 data.len() - before_len
670 );
671 }
672 } else {
673 let enc_bytes = Self::encode_split(&prev_split, &curr_split);
674 data.extend_from_slice(&enc_bytes);
676 data.push(0);
677
678 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
679 eprintln!(" Contig {}: '{}' (DELTA prev='{}') -> enc_bytes={:?}, {} bytes total",
680 contig_idx, contig.name, prev_split.join(" "), enc_bytes, data.len() - before_len);
681 }
682 }
683
684 prev_split = curr_split;
685 }
686 }
687
688 if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
689 eprintln!(" Total serialized size: {} bytes", data.len());
690 }
691
692 data
693 }
694
695 fn decode_bytes_string(ptr: &mut &[u8]) -> Result<Vec<u8>> {
697 let end = ptr
698 .iter()
699 .position(|&b| b == 0)
700 .context("Null terminator not found in bytes string")?;
701
702 let bytes = ptr[..end].to_vec();
703 *ptr = &ptr[end + 1..];
704 Ok(bytes)
705 }
706
707 fn deserialize_contig_names(&mut self, data: &[u8], i_sample: usize) -> Result<()> {
709 let mut ptr = data;
710
711 let no_samples_in_curr_batch = CollectionVarInt::decode(&mut ptr)? as usize;
712
713 for i in 0..no_samples_in_curr_batch {
714 let no_contigs = CollectionVarInt::decode(&mut ptr)? as usize;
715
716 let curr_sample = &mut self.sample_desc[i_sample + i];
717 curr_sample.contigs.clear();
718 curr_sample.contigs.reserve(no_contigs);
719
720 let mut prev_split: Vec<Vec<u8>> = Vec::new();
721
722 for _ in 0..no_contigs {
723 let enc_bytes = Self::decode_bytes_string(&mut ptr)?;
724
725 let mut curr_split: Vec<Vec<u8>> = enc_bytes
740 .split(|&b| b == b' ')
741 .map(|s| s.to_vec())
742 .collect();
743
744 let name = if prev_split.is_empty() || curr_split.len() != prev_split.len() {
745 String::from_utf8_lossy(&enc_bytes).to_string()
747 } else {
748 Self::decode_split_bytes(&prev_split, &mut curr_split)
750 };
751
752 prev_split = curr_split;
754
755 curr_sample.contigs.push(ContigDesc::new(name));
756 }
757 }
758
759 self.no_samples_in_last_batch = no_samples_in_curr_batch;
760
761 Ok(())
762 }
763
764 fn clear_in_group_ids(&mut self) {
766 self.in_group_ids.clear();
767 }
768
769 fn get_in_group_id(&self, pos: usize) -> i32 {
771 self.in_group_ids.get(pos).copied().unwrap_or(-1)
772 }
773
774 fn set_in_group_id(&mut self, pos: usize, val: i32) {
776 if pos >= self.in_group_ids.len() {
777 self.in_group_ids
778 .resize((pos as f64 * 1.2) as usize + 1, -1);
779 }
780 self.in_group_ids[pos] = val;
781 }
782
783 fn serialize_contig_details(&mut self, id_from: usize, id_to: usize) -> [Vec<u8>; 5] {
785 let mut v_data: [Vec<u8>; 5] = Default::default();
786
787 CollectionVarInt::encode(&mut v_data[0], (id_to - id_from) as u32);
788
789 self.clear_in_group_ids();
790
791 let pred_raw_length = self.segment_size + self.kmer_length;
792
793 let mut encoded_segments = Vec::new();
796
797 for sample_idx in id_from..id_to {
798 let mut sample_encoded = Vec::new();
799
800 for contig_idx in 0..self.sample_desc[sample_idx].contigs.len() {
801 let mut contig_encoded = Vec::new();
802
803 for seg_idx in 0..self.sample_desc[sample_idx].contigs[contig_idx]
804 .segments
805 .len()
806 {
807 let seg = &self.sample_desc[sample_idx].contigs[contig_idx].segments[seg_idx];
808 let prev_in_group_id = self.get_in_group_id(seg.group_id as usize);
809
810 let e_group_id = seg.group_id;
811 let e_in_group_id = if prev_in_group_id == -1 {
812 seg.in_group_id
813 } else if seg.in_group_id == 0 {
814 0
815 } else if seg.in_group_id as i32 == prev_in_group_id + 1 {
816 1
817 } else {
818 zigzag_encode(seg.in_group_id as u64, (prev_in_group_id + 1) as u64) as u32
819 + 1
820 };
821
822 let e_raw_length =
823 zigzag_encode(seg.raw_length as u64, pred_raw_length as u64) as u32;
824
825 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
827 eprintln!(
828 "ENCODE: group={}, in_group_id={}, prev={}, e_in_group_id={}",
829 seg.group_id, seg.in_group_id, prev_in_group_id, e_in_group_id
830 );
831 }
832
833 contig_encoded.push((e_group_id, e_in_group_id, e_raw_length, seg.is_rev_comp));
834
835 if seg.in_group_id as i32 > prev_in_group_id && seg.in_group_id > 0 {
839 self.set_in_group_id(seg.group_id as usize, seg.in_group_id as i32);
840 }
841 }
842
843 sample_encoded.push(contig_encoded);
844 }
845
846 encoded_segments.push(sample_encoded);
847 }
848
849 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
851 eprintln!(
852 "SERIALIZE: Writing {} samples to collection",
853 encoded_segments.len()
854 );
855 }
856
857 for (sample_idx, sample_encoded) in encoded_segments.iter().enumerate() {
858 let before_len = v_data[0].len();
859 CollectionVarInt::encode(&mut v_data[0], sample_encoded.len() as u32);
860 let after_len = v_data[0].len();
861
862 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
863 eprintln!(" Sample {}: {} contigs", sample_idx, sample_encoded.len());
864 eprintln!(
865 " Wrote bytes [{}..{}]: {:?}",
866 before_len,
867 after_len,
868 &v_data[0][before_len..after_len]
869 );
870 }
871
872 for (contig_idx, contig_encoded) in sample_encoded.iter().enumerate() {
873 let before_len = v_data[0].len();
874 CollectionVarInt::encode(&mut v_data[0], contig_encoded.len() as u32);
875 let after_len = v_data[0].len();
876
877 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
878 eprintln!(
879 " Contig {}: {} segments",
880 contig_idx,
881 contig_encoded.len()
882 );
883 eprintln!(
884 " Wrote bytes [{}..{}]: {:?}",
885 before_len,
886 after_len,
887 &v_data[0][before_len..after_len]
888 );
889 }
890
891 for &(e_group_id, e_in_group_id, e_raw_length, is_rev_comp) in contig_encoded {
892 CollectionVarInt::encode(&mut v_data[1], e_group_id);
893 CollectionVarInt::encode(&mut v_data[2], e_in_group_id);
894 CollectionVarInt::encode(&mut v_data[3], e_raw_length);
895 CollectionVarInt::encode(&mut v_data[4], is_rev_comp as u32);
896 }
897 }
898
899 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
900 let total_segments: usize = sample_encoded.iter().map(|c| c.len()).sum();
901 eprintln!(" Sample {sample_idx} total: {total_segments} segments");
902 }
903 }
904
905 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
906 eprintln!(
907 "Stream sizes: [0]={}, [1]={}, [2]={}, [3]={}, [4]={}",
908 v_data[0].len(),
909 v_data[1].len(),
910 v_data[2].len(),
911 v_data[3].len(),
912 v_data[4].len()
913 );
914 }
915
916 v_data
917 }
918
919 fn deserialize_contig_details(&mut self, v_data: &[Vec<u8>; 5], i_sample: usize) -> Result<()> {
921 let mut ptr0 = v_data[0].as_slice();
922
923 let no_samples_in_curr_batch = CollectionVarInt::decode(&mut ptr0)? as usize;
924
925 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
926 eprintln!("DESERIALIZE: Reading {no_samples_in_curr_batch} samples from collection");
927 }
928
929 let mut structure = Vec::new();
931 for sample_idx in 0..no_samples_in_curr_batch {
932 let no_contigs = CollectionVarInt::decode(&mut ptr0)? as usize;
933 let mut contig_seg_counts = Vec::new();
934
935 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
936 eprintln!(" Sample {sample_idx}: {no_contigs} contigs");
937 }
938
939 for contig_idx in 0..no_contigs {
940 let no_segments = CollectionVarInt::decode(&mut ptr0)? as usize;
941 contig_seg_counts.push(no_segments);
942
943 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
944 eprintln!(" Contig {contig_idx}: {no_segments} segments");
945 }
946 }
947
948 structure.push(contig_seg_counts);
949 }
950
951 let mut v_det: [Vec<u32>; 5] = Default::default();
953 let mut no_items = 0;
954
955 for counts in &structure {
956 for &count in counts {
957 no_items += count;
958 }
959 }
960
961 for i in 1..5 {
962 v_det[i].reserve(no_items);
963 let mut ptr = v_data[i].as_slice();
964
965 for _ in 0..no_items {
966 v_det[i].push(CollectionVarInt::decode(&mut ptr)?);
967 }
968 }
969
970 self.clear_in_group_ids();
972
973 let pred_raw_length = self.segment_size + self.kmer_length;
974 let mut item_idx = 0;
975
976 let mut decoded_segments = Vec::new();
979
980 for contig_seg_counts in &structure {
981 let mut sample_segs = Vec::new();
982
983 for &no_segments in contig_seg_counts {
984 let mut contig_segs = Vec::new();
985
986 for _ in 0..no_segments {
987 let c_group_id = v_det[1][item_idx];
988 let prev_in_group_id = self.get_in_group_id(c_group_id as usize);
989
990 let e_in_group_id = v_det[2][item_idx];
991 let c_in_group_id = if prev_in_group_id == -1 {
992 e_in_group_id
993 } else if e_in_group_id == 0 {
994 0
995 } else if e_in_group_id == 1 {
996 (prev_in_group_id + 1) as u32
997 } else {
998 zigzag_decode(e_in_group_id as u64 - 1, (prev_in_group_id + 1) as u64)
999 as u32
1000 };
1001
1002 if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
1004 eprintln!("DECODE: item={item_idx}, group={c_group_id}, e_in_group_id={e_in_group_id}, prev={prev_in_group_id}, c_in_group_id={c_in_group_id}");
1005 }
1006
1007 let c_raw_length =
1008 zigzag_decode(v_det[3][item_idx] as u64, pred_raw_length as u64) as u32;
1009 let c_is_rev_comp = v_det[4][item_idx] != 0;
1010
1011 contig_segs.push(SegmentDesc::new(
1012 c_group_id,
1013 c_in_group_id,
1014 c_is_rev_comp,
1015 c_raw_length,
1016 ));
1017
1018 if c_in_group_id as i32 > prev_in_group_id && c_in_group_id > 0 {
1022 self.set_in_group_id(c_group_id as usize, c_in_group_id as i32);
1023 }
1024
1025 item_idx += 1;
1026 }
1027
1028 sample_segs.push(contig_segs);
1029 }
1030
1031 decoded_segments.push(sample_segs);
1032 }
1033
1034 for (i, sample_segs) in decoded_segments.into_iter().enumerate() {
1036 let curr_sample = &mut self.sample_desc[i_sample + i];
1037
1038 for (j, contig_segs) in sample_segs.into_iter().enumerate() {
1039 curr_sample.contigs[j].segments = contig_segs;
1040 }
1041 }
1042
1043 Ok(())
1044 }
1045
1046 pub fn store_batch_sample_names(&mut self, archive: &mut Archive) -> Result<()> {
1048 let v_tmp = self.serialize_sample_names();
1049
1050 let v_data = zstd::encode_all(&v_tmp[..], 19).context("Failed to compress sample names")?;
1051
1052 let stream_id = self
1053 .collection_samples_id
1054 .context("collection-samples stream not registered")?;
1055
1056 archive.add_part_buffered(stream_id, v_data, v_tmp.len() as u64);
1057
1058 Ok(())
1059 }
1060
1061 pub fn load_batch_sample_names(&mut self, archive: &mut Archive) -> Result<()> {
1063 let stream_id = self
1064 .collection_samples_id
1065 .context("collection-samples stream not found")?;
1066
1067 let (v_tmp, raw_size) = archive
1068 .get_part(stream_id)?
1069 .context("No sample names batch found")?;
1070
1071 let v_data = zstd::decode_all(&v_tmp[..]).context("Failed to decompress sample names")?;
1072
1073 if v_data.len() != raw_size as usize {
1074 anyhow::bail!(
1075 "Decompressed size mismatch: expected {}, got {}",
1076 raw_size,
1077 v_data.len()
1078 );
1079 }
1080
1081 self.deserialize_sample_names(&v_data)?;
1082
1083 Ok(())
1084 }
1085
1086 #[allow(clippy::needless_range_loop)]
1088 pub fn load_contig_batch(&mut self, archive: &mut Archive, id_batch: usize) -> Result<()> {
1089 let i_sample = self.samples_loaded;
1092
1093 let contig_stream_id = self
1095 .collection_contigs_id
1096 .context("collection-contigs stream not found")?;
1097
1098 let (v_tmp_names, raw_size_names) = archive.get_part_by_id(contig_stream_id, id_batch)?;
1099 let v_data_names =
1100 zstd::decode_all(&v_tmp_names[..]).context("Failed to decompress contig names")?;
1101
1102 if v_data_names.len() != raw_size_names as usize {
1103 anyhow::bail!(
1104 "Decompressed size mismatch for contig names: expected {}, got {}",
1105 raw_size_names,
1106 v_data_names.len()
1107 );
1108 }
1109
1110 self.deserialize_contig_names(&v_data_names, i_sample)?;
1111
1112 let details_stream_id = self
1114 .collection_details_id
1115 .context("collection-details stream not found")?;
1116
1117 let (v_stream, _) = archive.get_part_by_id(details_stream_id, id_batch)?;
1118
1119 let mut ptr = v_stream.as_slice();
1121 let mut sizes: [(u32, u32); 5] = Default::default();
1122
1123 for i in 0..5 {
1124 sizes[i].0 = CollectionVarInt::decode(&mut ptr)?; sizes[i].1 = CollectionVarInt::decode(&mut ptr)?; }
1127
1128 let mut v_data_details_compressed: [Vec<u8>; 5] = Default::default();
1129 for i in 0..5 {
1130 v_data_details_compressed[i] = ptr[..sizes[i].1 as usize].to_vec();
1131 ptr = &ptr[sizes[i].1 as usize..];
1132 }
1133
1134 let mut v_data_details: [Vec<u8>; 5] = Default::default();
1136 for i in 0..5 {
1137 v_data_details[i] = zstd::decode_all(&v_data_details_compressed[i][..])
1138 .context(format!("Failed to decompress contig details stream {i}"))?;
1139
1140 if v_data_details[i].len() != sizes[i].0 as usize {
1141 anyhow::bail!(
1142 "Decompressed size mismatch for details stream {}: expected {}, got {}",
1143 i,
1144 sizes[i].0,
1145 v_data_details[i].len()
1146 );
1147 }
1148 }
1149
1150 self.deserialize_contig_details(&v_data_details, i_sample)?;
1151
1152 self.samples_loaded += self.no_samples_in_last_batch;
1154
1155 Ok(())
1156 }
1157
1158 #[allow(clippy::needless_range_loop)]
1160 pub fn store_contig_batch(
1161 &mut self,
1162 archive: &mut Archive,
1163 id_from: usize,
1164 id_to: usize,
1165 ) -> Result<()> {
1166 let v_tmp_names = self.serialize_contig_names(id_from, id_to);
1168 let v_data_names =
1169 zstd::encode_all(&v_tmp_names[..], 18).context("Failed to compress contig names")?;
1170
1171 let contig_stream_id = self
1172 .collection_contigs_id
1173 .context("collection-contigs stream not registered")?;
1174
1175 archive.add_part_buffered(contig_stream_id, v_data_names, v_tmp_names.len() as u64);
1176
1177 let v_data_details_raw = self.serialize_contig_details(id_from, id_to);
1179
1180 let mut v_data_details_compressed: [Vec<u8>; 5] = Default::default();
1181 for i in 0..5 {
1182 v_data_details_compressed[i] = zstd::encode_all(&v_data_details_raw[i][..], 19)
1183 .context(format!("Failed to compress contig details stream {i}"))?;
1184 }
1185
1186 let mut v_stream = Vec::new();
1188 for i in 0..5 {
1189 CollectionVarInt::encode(&mut v_stream, v_data_details_raw[i].len() as u32);
1190 CollectionVarInt::encode(&mut v_stream, v_data_details_compressed[i].len() as u32);
1191 }
1192 for i in 0..5 {
1193 v_stream.extend_from_slice(&v_data_details_compressed[i]);
1194 }
1195
1196 let details_stream_id = self
1197 .collection_details_id
1198 .context("collection-details stream not registered")?;
1199
1200 archive.add_part_buffered(details_stream_id, v_stream, 0);
1201
1202 for sample in &mut self.sample_desc[id_from..id_to] {
1204 sample.contigs.clear();
1205 sample.contigs.shrink_to_fit();
1206 }
1207
1208 Ok(())
1209 }
1210}
1211
1212#[cfg(test)]
1213mod tests {
1214 use super::*;
1215
1216 #[test]
1217 fn test_collection_varint_1byte() {
1218 let mut data = Vec::new();
1219 CollectionVarInt::encode(&mut data, 0);
1220 CollectionVarInt::encode(&mut data, 42);
1221 CollectionVarInt::encode(&mut data, 127);
1222
1223 let mut ptr = data.as_slice();
1224 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 0);
1225 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 42);
1226 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 127);
1227 }
1228
1229 #[test]
1230 fn test_collection_varint_2byte() {
1231 let mut data = Vec::new();
1232 CollectionVarInt::encode(&mut data, 128);
1233 CollectionVarInt::encode(&mut data, 255);
1234 CollectionVarInt::encode(&mut data, 16511);
1235
1236 let mut ptr = data.as_slice();
1237 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 128);
1238 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 255);
1239 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 16511);
1240 }
1241
1242 #[test]
1243 fn test_collection_varint_3byte() {
1244 let mut data = Vec::new();
1245 CollectionVarInt::encode(&mut data, 16512);
1246 CollectionVarInt::encode(&mut data, 65536);
1247 CollectionVarInt::encode(&mut data, 2113663);
1248
1249 let mut ptr = data.as_slice();
1250 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 16512);
1251 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 65536);
1252 assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 2113663);
1253 }
1254
1255 #[test]
1256 fn test_collection_varint_string() {
1257 let mut data = Vec::new();
1258 CollectionVarInt::encode_string(&mut data, "Hello");
1259 CollectionVarInt::encode_string(&mut data, "World");
1260
1261 let mut ptr = data.as_slice();
1262 assert_eq!(CollectionVarInt::decode_string(&mut ptr).unwrap(), "Hello");
1263 assert_eq!(CollectionVarInt::decode_string(&mut ptr).unwrap(), "World");
1264 }
1265
/// Pins the signed zigzag mapping on a handful of small values, first in the
/// encode direction and then in the decode direction.
#[test]
fn test_zigzag_i64() {
    // (signed value, zigzag code): 0 -> 0, 1 -> 2, -1 -> 1, 2 -> 4, -2 -> 3.
    let pairs = [(0, 0), (1, 2), (-1, 1), (2, 4), (-2, 3)];

    for &(signed, code) in &pairs {
        assert_eq!(zigzag_encode_i64(signed), code);
    }

    for &(signed, code) in &pairs {
        assert_eq!(zigzag_decode_i64(code), signed);
    }
}
1280
/// Pins the prediction-relative zigzag helpers against a prediction of 100:
/// the prediction itself maps to 0, one above to 2, one below to 1, and 200
/// maps to 200. Each code must decode back to the original value.
#[test]
fn test_zigzag_predictive() {
    // (current value, predicted value, expected code)
    let cases = [
        (100, 100, 0),
        (101, 100, 2),
        (99, 100, 1),
        (200, 100, 200),
    ];

    for &(current, predicted, code) in &cases {
        assert_eq!(zigzag_encode(current, predicted), code);
    }

    for &(current, predicted, code) in &cases {
        assert_eq!(zigzag_decode(code, predicted), current);
    }
}
1294
/// Checks that `SegmentDesc::new` stores each argument in the matching field.
#[test]
fn test_segment_desc() {
    // Destructure so every field of the descriptor is inspected by name.
    let SegmentDesc {
        group_id,
        in_group_id,
        is_rev_comp,
        raw_length,
    } = SegmentDesc::new(42, 7, true, 1000);

    assert_eq!(group_id, 42);
    assert_eq!(in_group_id, 7);
    assert!(is_rev_comp);
    assert_eq!(raw_length, 1000);
}
1303
/// Registers three sample/contig pairs spread over two samples and checks
/// that every registration returns `true`, that two samples are counted, and
/// that the sample list is `["sample1", "sample2"]`.
#[test]
fn test_collection_register() {
    let mut coll = CollectionV3::new();

    let registrations = [
        ("sample1", "contig1"),
        ("sample1", "contig2"),
        ("sample2", "contig1"),
    ];
    for &(sample, contig) in &registrations {
        let registered = coll.register_sample_contig(sample, contig).unwrap();
        assert!(registered);
    }

    // Two distinct sample names were used above.
    assert_eq!(coll.get_no_samples(), 2);

    let samples = coll.get_samples_list(false);
    assert_eq!(samples, vec!["sample1", "sample2"]);
}
1317
/// Round-trips contig details through `serialize_contig_details` /
/// `deserialize_contig_details` and checks that every segment's `in_group_id`
/// is preserved.
///
/// Seven segments are placed over three contigs, all in group 93, with
/// `in_group_id` values 0 through 6 assigned in placement order. The reader
/// collection is configured and registered exactly like the writer before the
/// serialized bytes are loaded into it.
#[test]
fn test_in_group_id_delta_encoding_roundtrip() {
    // Writer and reader need the same config and the same sample/contig
    // registrations, so both are produced by this one closure.
    let registered_collection = || {
        let mut collection = CollectionV3::new();
        collection.set_config(60000, 31, None);
        for &(sample, contig) in &[
            ("sample1", "contig1"),
            ("sample1", "contig2"),
            ("sample2", "contig1"),
        ] {
            collection.register_sample_contig(sample, contig).unwrap();
        }
        collection
    };

    let mut coll = registered_collection();

    // (sample, contig, place, group_id, in_group_id, is_rev_comp, raw_len)
    let placed_segments = [
        ("sample1", "contig1", 0, 93, 0, false, 61000),
        ("sample1", "contig1", 1, 93, 1, false, 61000),
        ("sample1", "contig2", 0, 93, 2, false, 61000),
        ("sample1", "contig2", 1, 93, 3, false, 61000),
        ("sample2", "contig1", 0, 93, 4, false, 61000),
        ("sample2", "contig1", 1, 93, 5, false, 61000),
        ("sample2", "contig1", 2, 93, 6, false, 61000),
    ];
    for &(sample, contig, place, group_id, in_group_id, is_rev_comp, raw_len) in &placed_segments {
        coll.add_segment_placed(
            sample,
            contig,
            place,
            group_id,
            in_group_id,
            is_rev_comp,
            raw_len,
        )
        .unwrap();
    }

    // NOTE(review): the arguments (0, 2) look like a sample index range
    // covering both samples; confirm against `serialize_contig_details`.
    let serialized = coll.serialize_contig_details(0, 2);

    let mut coll2 = registered_collection();
    coll2.deserialize_contig_details(&serialized, 0).unwrap();

    // Per contig: the message for the segment-count check, then one
    // (expected in_group_id, message) entry per segment in placement order.
    let expectations: [(&str, &str, &str, &[(u32, &str)]); 3] = [
        (
            "sample1",
            "contig1",
            "sample1/contig1 should have 2 segments",
            &[
                (0, "sample1/contig1 segment 0 should have in_group_id=0"),
                (1, "sample1/contig1 segment 1 should have in_group_id=1"),
            ],
        ),
        (
            "sample1",
            "contig2",
            "sample1/contig2 should have 2 segments",
            &[
                (2, "sample1/contig2 segment 0 should have in_group_id=2"),
                (3, "sample1/contig2 segment 1 should have in_group_id=3"),
            ],
        ),
        (
            "sample2",
            "contig1",
            "sample2/contig1 should have 3 segments",
            &[
                (4, "sample2/contig1 segment 0 should have in_group_id=4"),
                (5, "sample2/contig1 segment 1 should have in_group_id=5"),
                (6, "sample2/contig1 segment 2 should have in_group_id=6"),
            ],
        ),
    ];

    for &(sample, contig, count_msg, wanted) in &expectations {
        let segments = coll2.get_contig_desc(sample, contig).unwrap();
        assert_eq!(segments.len(), wanted.len(), "{}", count_msg);

        for (idx, &(in_group_id, msg)) in wanted.iter().enumerate() {
            assert_eq!(segments[idx].in_group_id, in_group_id, "{}", msg);
        }
    }
}
1421}