ragc_common/
collection.rs

1// Collection metadata structures
2// Manages sample names, contig names, and segment descriptors
3
4use crate::Archive;
5use anyhow::{Context, Result};
6use std::collections::HashMap;
7
8/// A segment descriptor identifying a compressed segment
9///
10/// # Architecture
11///
12/// Segments are organized into groups by terminal k-mers (front and back).
13/// Each group has two streams:
14/// - **ref_stream** (xGr): Contains reference segment (in_group_id = 0)
15/// - **delta_stream** (xGd): Contains LZ-encoded segments (in_group_id = 1, 2, 3, ...)
16///
17/// For groups >= 16 (LZ-enabled):
18/// - Reference is stored raw in ref_stream
19/// - Other segments are LZ-encoded against reference
20///
21/// For groups 0-15 (raw-only):
22/// - No LZ encoding, all segments stored raw
23///
24/// # Critical: in_group_id Indexing
25///
26/// - **in_group_id = 0**: ALWAYS the reference segment (in ref_stream)
27/// - **in_group_id >= 1**: Delta segments (in delta_stream)
28///
29/// IMPORTANT: Do NOT store reference in delta_stream! It must be:
30/// 1. Written to ref_stream
31/// 2. Registered with in_group_id = 0
32/// 3. Not included in delta pack
33#[derive(Debug, Clone, Copy, PartialEq)]
34pub struct SegmentDesc {
35    pub group_id: u32,
36    pub in_group_id: u32, // 0 = reference, 1+ = deltas
37    pub is_rev_comp: bool,
38    pub raw_length: u32,
39}
40
41impl SegmentDesc {
42    pub fn new(group_id: u32, in_group_id: u32, is_rev_comp: bool, raw_length: u32) -> Self {
43        SegmentDesc {
44            group_id,
45            in_group_id,
46            is_rev_comp,
47            raw_length,
48        }
49    }
50
51    pub fn empty() -> Self {
52        SegmentDesc {
53            group_id: u32::MAX,
54            in_group_id: u32::MAX,
55            is_rev_comp: false,
56            raw_length: 0,
57        }
58    }
59}
60
61/// A contig with its name and segments
62#[derive(Debug, Clone)]
63pub struct ContigDesc {
64    pub name: String,
65    pub segments: Vec<SegmentDesc>,
66}
67
68impl ContigDesc {
69    pub fn new(name: String) -> Self {
70        ContigDesc {
71            name,
72            segments: Vec::new(),
73        }
74    }
75}
76
77/// A sample with its name and contigs
78#[derive(Debug, Clone)]
79pub struct SampleDesc {
80    pub name: String,
81    pub contigs: Vec<ContigDesc>,
82}
83
84impl SampleDesc {
85    pub fn new(name: String) -> Self {
86        SampleDesc {
87            name,
88            contigs: Vec::new(),
89        }
90    }
91}
92
93/// Custom variable-length integer encoding for Collection metadata
94/// Different from standard varint - uses prefix bits to indicate length
95///
96/// Format:
97/// - 0-127:         1 byte  (0xxxxxxx)
98/// - 128-16511:     2 bytes (10xxxxxx xxxxxxxx)
99/// - 16512-2113663: 3 bytes (110xxxxx xxxxxxxx xxxxxxxx)
100/// - etc.
101pub struct CollectionVarInt;
102
103impl CollectionVarInt {
104    const THR_1: u32 = 1 << 7;
105    const THR_2: u32 = Self::THR_1 + (1 << 14);
106    const THR_3: u32 = Self::THR_2 + (1 << 21);
107    const THR_4: u32 = Self::THR_3 + (1 << 28);
108
109    const PREF_1: u8 = 0;
110    const PREF_2: u8 = 0b10000000u8;
111    const PREF_3: u8 = 0b11000000u8;
112    const PREF_4: u8 = 0b11100000u8;
113    const PREF_5: u8 = 0b11110000u8;
114
115    const MASK_1: u8 = 0b10000000u8;
116    const MASK_2: u8 = 0b11000000u8;
117    const MASK_3: u8 = 0b11100000u8;
118    const MASK_4: u8 = 0b11110000u8;
119
120    pub fn encode(data: &mut Vec<u8>, num: u32) {
121        if num < Self::THR_1 {
122            data.push(Self::PREF_1 + num as u8);
123        } else if num < Self::THR_2 {
124            let num = num - Self::THR_1;
125            data.push(Self::PREF_2 + (num >> 8) as u8);
126            data.push((num & 0xff) as u8);
127        } else if num < Self::THR_3 {
128            let num = num - Self::THR_2;
129            data.push(Self::PREF_3 + (num >> 16) as u8);
130            data.push(((num >> 8) & 0xff) as u8);
131            data.push((num & 0xff) as u8);
132        } else if num < Self::THR_4 {
133            let num = num - Self::THR_3;
134            data.push(Self::PREF_4 + (num >> 24) as u8);
135            data.push(((num >> 16) & 0xff) as u8);
136            data.push(((num >> 8) & 0xff) as u8);
137            data.push((num & 0xff) as u8);
138        } else {
139            let num = num - Self::THR_4;
140            data.push(Self::PREF_5);
141            data.push(((num >> 24) & 0xff) as u8);
142            data.push(((num >> 16) & 0xff) as u8);
143            data.push(((num >> 8) & 0xff) as u8);
144            data.push((num & 0xff) as u8);
145        }
146    }
147
148    pub fn decode(ptr: &mut &[u8]) -> Result<u32> {
149        if ptr.is_empty() {
150            anyhow::bail!("Unexpected end of data while decoding varint");
151        }
152
153        let first = ptr[0];
154
155        if (first & Self::MASK_1) == Self::PREF_1 {
156            let num = (first - Self::PREF_1) as u32;
157            *ptr = &ptr[1..];
158            Ok(num)
159        } else if (first & Self::MASK_2) == Self::PREF_2 {
160            if ptr.len() < 2 {
161                anyhow::bail!("Unexpected end of data while decoding 2-byte varint");
162            }
163            let num =
164                ((ptr[0] as u32) << 8) + ptr[1] as u32 + Self::THR_1 - ((Self::PREF_2 as u32) << 8);
165            *ptr = &ptr[2..];
166            Ok(num)
167        } else if (first & Self::MASK_3) == Self::PREF_3 {
168            if ptr.len() < 3 {
169                anyhow::bail!("Unexpected end of data while decoding 3-byte varint");
170            }
171            let num =
172                ((ptr[0] as u32) << 16) + ((ptr[1] as u32) << 8) + ptr[2] as u32 + Self::THR_2
173                    - ((Self::PREF_3 as u32) << 16);
174            *ptr = &ptr[3..];
175            Ok(num)
176        } else if (first & Self::MASK_4) == Self::PREF_4 {
177            if ptr.len() < 4 {
178                anyhow::bail!("Unexpected end of data while decoding 4-byte varint");
179            }
180            let num = ((ptr[0] as u32) << 24)
181                + ((ptr[1] as u32) << 16)
182                + ((ptr[2] as u32) << 8)
183                + ptr[3] as u32
184                + Self::THR_3
185                - ((Self::PREF_4 as u32) << 24);
186            *ptr = &ptr[4..];
187            Ok(num)
188        } else {
189            // PREF_5
190            if ptr.len() < 5 {
191                anyhow::bail!("Unexpected end of data while decoding 5-byte varint");
192            }
193            let mut num = ptr[1] as u32;
194            num <<= 8;
195            num += ptr[2] as u32;
196            num <<= 8;
197            num += ptr[3] as u32;
198            num <<= 8;
199            num += ptr[4] as u32;
200            num += Self::THR_4;
201            *ptr = &ptr[5..];
202            Ok(num)
203        }
204    }
205
206    pub fn encode_string(data: &mut Vec<u8>, s: &str) {
207        data.extend_from_slice(s.as_bytes());
208        data.push(0);
209    }
210
211    pub fn decode_string(ptr: &mut &[u8]) -> Result<String> {
212        let end = ptr
213            .iter()
214            .position(|&b| b == 0)
215            .context("Null terminator not found in string")?;
216
217        let s = String::from_utf8(ptr[..end].to_vec()).context("Invalid UTF-8 in string")?;
218
219        *ptr = &ptr[end + 1..];
220        Ok(s)
221    }
222}
223
224/// Zigzag encoding for signed differences
225pub fn zigzag_encode_i64(x: i64) -> u64 {
226    if x >= 0 {
227        (2 * x) as u64
228    } else {
229        (2 * (-x) - 1) as u64
230    }
231}
232
233pub fn zigzag_decode_i64(x: u64) -> i64 {
234    if x & 1 != 0 {
235        -(x.div_ceil(2) as i64)
236    } else {
237        (x / 2) as i64
238    }
239}
240
241/// Zigzag encoding with prediction
242pub fn zigzag_encode(x_curr: u64, x_prev: u64) -> u64 {
243    if x_curr < x_prev {
244        2 * (x_prev - x_curr) - 1
245    } else if x_curr < 2 * x_prev {
246        2 * (x_curr - x_prev)
247    } else {
248        x_curr
249    }
250}
251
252pub fn zigzag_decode(x_val: u64, x_prev: u64) -> u64 {
253    if x_val >= 2 * x_prev {
254        x_val
255    } else if x_val & 1 != 0 {
256        (2 * x_prev - x_val) / 2
257    } else {
258        (x_val + 2 * x_prev) / 2
259    }
260}
261
262/// Collection V3 - manages metadata for AGC archives
263pub struct CollectionV3 {
264    sample_desc: Vec<SampleDesc>,
265    sample_ids: HashMap<String, usize>,
266
267    // Archive stream IDs
268    collection_samples_id: Option<usize>,
269    collection_contigs_id: Option<usize>,
270    collection_details_id: Option<usize>,
271
272    // Configuration
273    batch_size: usize,
274    segment_size: u32,
275    kmer_length: u32,
276
277    // State tracking
278    prev_sample_name: String,
279    #[allow(dead_code)]
280    placing_sample_name: String,
281    #[allow(dead_code)]
282    placing_sample_id: usize,
283    no_samples_in_last_batch: usize,
284
285    // For in_group_id delta encoding
286    in_group_ids: Vec<i32>,
287}
288
289impl Default for CollectionV3 {
290    fn default() -> Self {
291        Self::new()
292    }
293}
294
295impl CollectionV3 {
296    pub fn new() -> Self {
297        CollectionV3 {
298            sample_desc: Vec::new(),
299            sample_ids: HashMap::new(),
300            collection_samples_id: None,
301            collection_contigs_id: None,
302            collection_details_id: None,
303            batch_size: 1 << 20, // 1MB batches
304            segment_size: 0,
305            kmer_length: 0,
306            prev_sample_name: String::new(),
307            placing_sample_name: String::new(),
308            placing_sample_id: 0,
309            no_samples_in_last_batch: 0,
310            in_group_ids: Vec::new(),
311        }
312    }
313
314    pub fn set_config(&mut self, segment_size: u32, kmer_length: u32, batch_size: Option<usize>) {
315        self.segment_size = segment_size;
316        self.kmer_length = kmer_length;
317        if let Some(bs) = batch_size {
318            self.batch_size = bs;
319        }
320    }
321
322    pub fn prepare_for_compression(&mut self, archive: &mut Archive) -> Result<()> {
323        self.collection_samples_id = Some(archive.register_stream("collection-samples"));
324        self.collection_contigs_id = Some(archive.register_stream("collection-contigs"));
325        self.collection_details_id = Some(archive.register_stream("collection-details"));
326        Ok(())
327    }
328
329    pub fn prepare_for_decompression(&mut self, archive: &Archive) -> Result<()> {
330        self.collection_samples_id = archive.get_stream_id("collection-samples");
331        self.collection_contigs_id = archive.get_stream_id("collection-contigs");
332        self.collection_details_id = archive.get_stream_id("collection-details");
333
334        if self.collection_samples_id.is_none() {
335            anyhow::bail!("collection-samples stream not found in archive");
336        }
337        if self.collection_contigs_id.is_none() {
338            anyhow::bail!("collection-contigs stream not found in archive");
339        }
340        if self.collection_details_id.is_none() {
341            anyhow::bail!("collection-details stream not found in archive");
342        }
343
344        Ok(())
345    }
346
347    /// Register a sample and contig (idempotent - adds contig to existing sample if needed)
348    pub fn register_sample_contig(&mut self, sample_name: &str, contig_name: &str) -> Result<bool> {
349        let mut stored_sample_name = sample_name.to_string();
350
351        if sample_name.is_empty() {
352            stored_sample_name = Self::extract_contig_name(contig_name);
353        }
354
355        // Get or create sample
356        let sample_id = if let Some(&id) = self.sample_ids.get(&stored_sample_name) {
357            // Sample already exists
358            id
359        } else {
360            // New sample
361            let id = self.sample_ids.len();
362            self.sample_ids.insert(stored_sample_name.clone(), id);
363            self.sample_desc
364                .push(SampleDesc::new(stored_sample_name.clone()));
365            id
366        };
367
368        self.prev_sample_name = stored_sample_name;
369
370        // Add contig to the sample (avoid duplicates)
371        let sample = &mut self.sample_desc[sample_id];
372        if !sample.contigs.iter().any(|c| c.name == contig_name) {
373            sample
374                .contigs
375                .push(ContigDesc::new(contig_name.to_string()));
376            Ok(true)
377        } else {
378            Ok(false) // Contig already registered
379        }
380    }
381
382    /// Add segment placement information
383    #[allow(clippy::too_many_arguments)]
384    pub fn add_segment_placed(
385        &mut self,
386        sample_name: &str,
387        contig_name: &str,
388        place: usize,
389        group_id: u32,
390        in_group_id: u32,
391        is_rev_comp: bool,
392        raw_length: u32,
393    ) -> Result<()> {
394        let mut stored_sample_name = sample_name.to_string();
395
396        if sample_name.is_empty() {
397            stored_sample_name = Self::extract_contig_name(contig_name);
398        }
399
400        // BUGFIX: Always lookup sample_id fresh instead of caching
401        // The cache was causing segments to be registered to the wrong sample
402        // when GroupWriters buffer segments across sample boundaries
403        let sample_id = *self
404            .sample_ids
405            .get(&stored_sample_name)
406            .context(format!("Sample not found: {stored_sample_name}"))?;
407
408        if std::env::var("RAGC_DEBUG_REGISTER").is_ok() {
409            eprintln!("REGISTER: sample='{stored_sample_name}' (id={sample_id}), contig='{contig_name}', place={place}");
410        }
411
412        let sample = &mut self.sample_desc[sample_id];
413        for contig in &mut sample.contigs {
414            if contig.name == contig_name {
415                if place >= contig.segments.len() {
416                    contig.segments.resize(place + 1, SegmentDesc::empty());
417                }
418                contig.segments[place] =
419                    SegmentDesc::new(group_id, in_group_id, is_rev_comp, raw_length);
420                return Ok(());
421            }
422        }
423
424        anyhow::bail!("Contig {contig_name} not found in sample {stored_sample_name}");
425    }
426
427    /// Get list of samples
428    pub fn get_samples_list(&self, sorted: bool) -> Vec<String> {
429        let mut samples: Vec<String> = self.sample_desc.iter().map(|s| s.name.clone()).collect();
430        if sorted {
431            samples.sort();
432        }
433        samples
434    }
435
436    /// Get number of samples
437    pub fn get_no_samples(&self) -> usize {
438        self.sample_desc.len()
439    }
440
441    /// Get number of contigs in a sample
442    pub fn get_no_contigs(&self, sample_name: &str) -> Option<usize> {
443        self.sample_ids
444            .get(sample_name)
445            .map(|&id| self.sample_desc[id].contigs.len())
446    }
447
448    /// Get contig list for a sample
449    pub fn get_contig_list(&self, sample_name: &str) -> Option<Vec<String>> {
450        self.sample_ids.get(sample_name).map(|&id| {
451            self.sample_desc[id]
452                .contigs
453                .iter()
454                .map(|c| c.name.clone())
455                .collect()
456        })
457    }
458
459    /// Get sample descriptor with all contigs and segments
460    pub fn get_sample_desc(&self, sample_name: &str) -> Option<Vec<(String, Vec<SegmentDesc>)>> {
461        self.sample_ids.get(sample_name).map(|&id| {
462            self.sample_desc[id]
463                .contigs
464                .iter()
465                .map(|c| (c.name.clone(), c.segments.clone()))
466                .collect()
467        })
468    }
469
470    /// Get contig descriptor for a specific contig in a sample
471    pub fn get_contig_desc(
472        &self,
473        sample_name: &str,
474        contig_name: &str,
475    ) -> Option<Vec<SegmentDesc>> {
476        self.sample_ids.get(sample_name).and_then(|&id| {
477            self.sample_desc[id]
478                .contigs
479                .iter()
480                .find(|c| c.name == contig_name)
481                .map(|c| c.segments.clone())
482        })
483    }
484
485    /// Get configuration parameters (segment_size, kmer_length)
486    pub fn get_params(&self) -> (u32, u32) {
487        (self.segment_size, self.kmer_length)
488    }
489
490    /// Extract short contig name (first word)
491    fn extract_contig_name(s: &str) -> String {
492        s.split_whitespace().next().unwrap_or(s).to_string()
493    }
494
495    // Serialization/Deserialization methods
496
497    /// Serialize sample names
498    fn serialize_sample_names(&self) -> Vec<u8> {
499        let mut data = Vec::new();
500        CollectionVarInt::encode(&mut data, self.sample_desc.len() as u32);
501
502        for sample in &self.sample_desc {
503            CollectionVarInt::encode_string(&mut data, &sample.name);
504        }
505
506        data
507    }
508
509    /// Deserialize sample names
510    fn deserialize_sample_names(&mut self, data: &[u8]) -> Result<()> {
511        let mut ptr = data;
512
513        let no_samples = CollectionVarInt::decode(&mut ptr)?;
514
515        self.sample_desc.clear();
516        self.sample_ids.clear();
517
518        for i in 0..no_samples {
519            let name = CollectionVarInt::decode_string(&mut ptr)?;
520            self.sample_ids.insert(name.clone(), i as usize);
521            self.sample_desc.push(SampleDesc::new(name));
522        }
523
524        Ok(())
525    }
526
527    /// Split a string by spaces
528    fn split_string(s: &str) -> Vec<String> {
529        s.split(' ').map(|s| s.to_string()).collect()
530    }
531
532    /// Encode contig name using delta from previous name (returns raw bytes)
533    fn encode_split(prev_split: &[String], curr_split: &[String]) -> Vec<u8> {
534        let mut enc = Vec::new();
535
536        for i in 0..curr_split.len() {
537            if prev_split[i] == curr_split[i] {
538                enc.push((-127i8) as u8); // Same component marker (0x81 = 129)
539            } else if prev_split[i].len() != curr_split[i].len() {
540                enc.extend_from_slice(curr_split[i].as_bytes());
541            } else {
542                // Encode with run-length for identical characters
543                let mut cnt: i8 = 0;
544                let p_bytes = prev_split[i].as_bytes();
545                let c_bytes = curr_split[i].as_bytes();
546
547                for j in 0..c_bytes.len() {
548                    if p_bytes[j] == c_bytes[j] {
549                        if cnt == 100 {
550                            enc.push((-cnt) as u8); // Repetition marker
551                            cnt = 1;
552                        } else {
553                            cnt += 1;
554                        }
555                    } else {
556                        if cnt > 0 {
557                            enc.push((-cnt) as u8); // Repetition marker
558                            cnt = 0;
559                        }
560                        enc.push(c_bytes[j]);
561                    }
562                }
563
564                if cnt > 0 {
565                    enc.push((-cnt) as u8); // Repetition marker
566                }
567            }
568
569            enc.push(b' ');
570        }
571
572        // Remove final space
573        if enc.last() == Some(&b' ') {
574            enc.pop();
575        }
576
577        enc
578    }
579
580    /// Decode contig name using delta from previous name (from raw bytes)
581    #[allow(clippy::ptr_arg)]
582    fn decode_split(prev_split: &mut Vec<String>, curr_split_bytes: &[u8]) -> String {
583        // Split by space
584        let parts: Vec<&[u8]> = curr_split_bytes.split(|&b| b == b' ').collect();
585        let mut dec_parts = Vec::new();
586
587        for i in 0..parts.len() {
588            if parts[i].len() == 1 && (parts[i][0] as i8) == -127 {
589                // Same component marker
590                dec_parts.push(prev_split[i].clone());
591            } else {
592                let mut cmp = Vec::new();
593                let p_bytes = prev_split[i].as_bytes();
594                let mut p_idx = 0;
595
596                for &byte in parts[i] {
597                    let c = byte as i8;
598                    if c >= 0 {
599                        cmp.push(byte);
600                        p_idx += 1;
601                    } else {
602                        // Repetition count
603                        let count = -c as usize;
604                        cmp.extend_from_slice(&p_bytes[p_idx..p_idx + count]);
605                        p_idx += count;
606                    }
607                }
608
609                let component = String::from_utf8(cmp).expect("Invalid UTF-8 in component");
610                prev_split[i] = component.clone();
611                dec_parts.push(component);
612            }
613        }
614
615        dec_parts.join(" ")
616    }
617
618    /// Serialize contig names for a batch of samples
619    fn serialize_contig_names(&self, id_from: usize, id_to: usize) -> Vec<u8> {
620        let mut data = Vec::new();
621
622        CollectionVarInt::encode(&mut data, (id_to - id_from) as u32);
623
624        if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
625            eprintln!("SERIALIZE_CONTIG_NAMES: samples {id_from}..{id_to}");
626        }
627
628        for (sample_idx, sample) in self.sample_desc[id_from..id_to].iter().enumerate() {
629            CollectionVarInt::encode(&mut data, sample.contigs.len() as u32);
630
631            if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
632                eprintln!(
633                    "  Sample {} ({}): {} contigs",
634                    id_from + sample_idx,
635                    sample.name,
636                    sample.contigs.len()
637                );
638            }
639
640            let mut prev_split = Vec::new();
641
642            for (contig_idx, contig) in sample.contigs.iter().enumerate() {
643                let curr_split = Self::split_string(&contig.name);
644                let before_len = data.len();
645
646                if curr_split.len() != prev_split.len() {
647                    CollectionVarInt::encode_string(&mut data, &contig.name);
648
649                    if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
650                        eprintln!(
651                            "    Contig {}: '{}' (FULL) -> {} bytes",
652                            contig_idx,
653                            contig.name,
654                            data.len() - before_len
655                        );
656                    }
657                } else {
658                    let enc_bytes = Self::encode_split(&prev_split, &curr_split);
659                    // Write as raw bytes with null terminator
660                    data.extend_from_slice(&enc_bytes);
661                    data.push(0);
662
663                    if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
664                        eprintln!("    Contig {}: '{}' (DELTA prev='{}') -> enc_bytes={:?}, {} bytes total",
665                            contig_idx, contig.name, prev_split.join(" "), enc_bytes, data.len() - before_len);
666                    }
667                }
668
669                prev_split = curr_split;
670            }
671        }
672
673        if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
674            eprintln!("  Total serialized size: {} bytes", data.len());
675        }
676
677        data
678    }
679
680    /// Decode raw bytes string (handles non-UTF8 bytes)
681    fn decode_bytes_string(ptr: &mut &[u8]) -> Result<Vec<u8>> {
682        let end = ptr
683            .iter()
684            .position(|&b| b == 0)
685            .context("Null terminator not found in bytes string")?;
686
687        let bytes = ptr[..end].to_vec();
688        *ptr = &ptr[end + 1..];
689        Ok(bytes)
690    }
691
692    /// Deserialize contig names for a batch of samples
693    fn deserialize_contig_names(&mut self, data: &[u8], i_sample: usize) -> Result<()> {
694        let mut ptr = data;
695
696        let no_samples_in_curr_batch = CollectionVarInt::decode(&mut ptr)? as usize;
697
698        for i in 0..no_samples_in_curr_batch {
699            let no_contigs = CollectionVarInt::decode(&mut ptr)? as usize;
700
701            let curr_sample = &mut self.sample_desc[i_sample + i];
702            curr_sample.contigs.clear();
703            curr_sample.contigs.reserve(no_contigs);
704
705            let mut prev_split = Vec::new();
706
707            for _ in 0..no_contigs {
708                let enc_bytes = Self::decode_bytes_string(&mut ptr)?;
709
710                // Try to decode as UTF-8 first
711                let name = if let Ok(enc_str) = std::str::from_utf8(&enc_bytes) {
712                    let curr_split = Self::split_string(enc_str);
713
714                    if prev_split.is_empty() || curr_split.len() != prev_split.len() {
715                        // Plain text (first contig or different structure)
716                        prev_split = curr_split;
717                        enc_str.to_string()
718                    } else {
719                        // Encoded with delta
720                        let decoded = Self::decode_split(&mut prev_split, &enc_bytes);
721                        prev_split = Self::split_string(&decoded);
722                        decoded
723                    }
724                } else {
725                    // Contains non-UTF8 bytes, must be encoded (should not happen if prev_split is empty)
726                    if prev_split.is_empty() {
727                        anyhow::bail!("Cannot decode non-UTF8 bytes without previous split");
728                    }
729                    let decoded = Self::decode_split(&mut prev_split, &enc_bytes);
730                    prev_split = Self::split_string(&decoded);
731                    decoded
732                };
733
734                curr_sample.contigs.push(ContigDesc::new(name));
735            }
736        }
737
738        self.no_samples_in_last_batch = no_samples_in_curr_batch;
739
740        Ok(())
741    }
742
743    /// Clear in_group_ids tracking
744    fn clear_in_group_ids(&mut self) {
745        self.in_group_ids.clear();
746    }
747
748    /// Get in_group_id for a group
749    fn get_in_group_id(&self, pos: usize) -> i32 {
750        self.in_group_ids.get(pos).copied().unwrap_or(-1)
751    }
752
753    /// Set in_group_id for a group
754    fn set_in_group_id(&mut self, pos: usize, val: i32) {
755        if pos >= self.in_group_ids.len() {
756            self.in_group_ids
757                .resize((pos as f64 * 1.2) as usize + 1, -1);
758        }
759        self.in_group_ids[pos] = val;
760    }
761
762    /// Serialize contig details (5-stream format)
763    fn serialize_contig_details(&mut self, id_from: usize, id_to: usize) -> [Vec<u8>; 5] {
764        let mut v_data: [Vec<u8>; 5] = Default::default();
765
766        CollectionVarInt::encode(&mut v_data[0], (id_to - id_from) as u32);
767
768        self.clear_in_group_ids();
769
770        let pred_raw_length = self.segment_size + self.kmer_length;
771
772        // First pass: compute encoded values
773        // IMPORTANT: Update in_group_ids IMMEDIATELY after encoding each segment (matching C++ AGC)
774        let mut encoded_segments = Vec::new();
775
776        for sample_idx in id_from..id_to {
777            let mut sample_encoded = Vec::new();
778
779            for contig_idx in 0..self.sample_desc[sample_idx].contigs.len() {
780                let mut contig_encoded = Vec::new();
781
782                for seg_idx in 0..self.sample_desc[sample_idx].contigs[contig_idx]
783                    .segments
784                    .len()
785                {
786                    let seg = &self.sample_desc[sample_idx].contigs[contig_idx].segments[seg_idx];
787                    let prev_in_group_id = self.get_in_group_id(seg.group_id as usize);
788
789                    let e_group_id = seg.group_id;
790                    let e_in_group_id = if prev_in_group_id == -1 {
791                        seg.in_group_id
792                    } else if seg.in_group_id == 0 {
793                        0
794                    } else if seg.in_group_id as i32 == prev_in_group_id + 1 {
795                        1
796                    } else {
797                        zigzag_encode(seg.in_group_id as u64, (prev_in_group_id + 1) as u64) as u32
798                            + 1
799                    };
800
801                    let e_raw_length =
802                        zigzag_encode(seg.raw_length as u64, pred_raw_length as u64) as u32;
803
804                    // DEBUG: Print encoded values during serialization
805                    if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
806                        eprintln!(
807                            "ENCODE: group={}, in_group_id={}, prev={}, e_in_group_id={}",
808                            seg.group_id, seg.in_group_id, prev_in_group_id, e_in_group_id
809                        );
810                    }
811
812                    contig_encoded.push((e_group_id, e_in_group_id, e_raw_length, seg.is_rev_comp));
813
814                    // Apply update IMMEDIATELY (matching C++ AGC behavior at collection_v3.cpp:581-582)
815                    // NOTE: C++ AGC condition is: c_in_group_id > prev_in_group_id && c_in_group_id > 0
816                    // We must match this EXACTLY even though it seems wrong!
817                    if seg.in_group_id as i32 > prev_in_group_id && seg.in_group_id > 0 {
818                        self.set_in_group_id(seg.group_id as usize, seg.in_group_id as i32);
819                    }
820                }
821
822                sample_encoded.push(contig_encoded);
823            }
824
825            encoded_segments.push(sample_encoded);
826        }
827
828        // Second pass: write to output streams
829        if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
830            eprintln!(
831                "SERIALIZE: Writing {} samples to collection",
832                encoded_segments.len()
833            );
834        }
835
836        for (sample_idx, sample_encoded) in encoded_segments.iter().enumerate() {
837            let before_len = v_data[0].len();
838            CollectionVarInt::encode(&mut v_data[0], sample_encoded.len() as u32);
839            let after_len = v_data[0].len();
840
841            if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
842                eprintln!("  Sample {}: {} contigs", sample_idx, sample_encoded.len());
843                eprintln!(
844                    "    Wrote bytes [{}..{}]: {:?}",
845                    before_len,
846                    after_len,
847                    &v_data[0][before_len..after_len]
848                );
849            }
850
851            for (contig_idx, contig_encoded) in sample_encoded.iter().enumerate() {
852                let before_len = v_data[0].len();
853                CollectionVarInt::encode(&mut v_data[0], contig_encoded.len() as u32);
854                let after_len = v_data[0].len();
855
856                if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
857                    eprintln!(
858                        "    Contig {}: {} segments",
859                        contig_idx,
860                        contig_encoded.len()
861                    );
862                    eprintln!(
863                        "      Wrote bytes [{}..{}]: {:?}",
864                        before_len,
865                        after_len,
866                        &v_data[0][before_len..after_len]
867                    );
868                }
869
870                for &(e_group_id, e_in_group_id, e_raw_length, is_rev_comp) in contig_encoded {
871                    CollectionVarInt::encode(&mut v_data[1], e_group_id);
872                    CollectionVarInt::encode(&mut v_data[2], e_in_group_id);
873                    CollectionVarInt::encode(&mut v_data[3], e_raw_length);
874                    CollectionVarInt::encode(&mut v_data[4], is_rev_comp as u32);
875                }
876            }
877
878            if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
879                let total_segments: usize = sample_encoded.iter().map(|c| c.len()).sum();
880                eprintln!("  Sample {sample_idx} total: {total_segments} segments");
881            }
882        }
883
884        if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
885            eprintln!(
886                "Stream sizes: [0]={}, [1]={}, [2]={}, [3]={}, [4]={}",
887                v_data[0].len(),
888                v_data[1].len(),
889                v_data[2].len(),
890                v_data[3].len(),
891                v_data[4].len()
892            );
893        }
894
895        v_data
896    }
897
898    /// Deserialize contig details (5-stream format)
899    fn deserialize_contig_details(&mut self, v_data: &[Vec<u8>; 5], i_sample: usize) -> Result<()> {
900        let mut ptr0 = v_data[0].as_slice();
901
902        let no_samples_in_curr_batch = CollectionVarInt::decode(&mut ptr0)? as usize;
903
904        if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
905            eprintln!("DESERIALIZE: Reading {no_samples_in_curr_batch} samples from collection");
906        }
907
908        // First pass: decode structure and counts
909        let mut structure = Vec::new();
910        for sample_idx in 0..no_samples_in_curr_batch {
911            let no_contigs = CollectionVarInt::decode(&mut ptr0)? as usize;
912            let mut contig_seg_counts = Vec::new();
913
914            if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
915                eprintln!("  Sample {sample_idx}: {no_contigs} contigs");
916            }
917
918            for contig_idx in 0..no_contigs {
919                let no_segments = CollectionVarInt::decode(&mut ptr0)? as usize;
920                contig_seg_counts.push(no_segments);
921
922                if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
923                    eprintln!("    Contig {contig_idx}: {no_segments} segments");
924                }
925            }
926
927            structure.push(contig_seg_counts);
928        }
929
930        // Decode detail streams
931        let mut v_det: [Vec<u32>; 5] = Default::default();
932        let mut no_items = 0;
933
934        for counts in &structure {
935            for &count in counts {
936                no_items += count;
937            }
938        }
939
940        for i in 1..5 {
941            v_det[i].reserve(no_items);
942            let mut ptr = v_data[i].as_slice();
943
944            for _ in 0..no_items {
945                v_det[i].push(CollectionVarInt::decode(&mut ptr)?);
946            }
947        }
948
949        // Reconstruct segments - first pass: decode all values
950        self.clear_in_group_ids();
951
952        let pred_raw_length = self.segment_size + self.kmer_length;
953        let mut item_idx = 0;
954
955        // Collect all decoded segments first
956        // IMPORTANT: Update in_group_ids IMMEDIATELY after decoding each segment (matching C++ AGC)
957        let mut decoded_segments = Vec::new();
958
959        for contig_seg_counts in &structure {
960            let mut sample_segs = Vec::new();
961
962            for &no_segments in contig_seg_counts {
963                let mut contig_segs = Vec::new();
964
965                for _ in 0..no_segments {
966                    let c_group_id = v_det[1][item_idx];
967                    let prev_in_group_id = self.get_in_group_id(c_group_id as usize);
968
969                    let e_in_group_id = v_det[2][item_idx];
970                    let c_in_group_id = if prev_in_group_id == -1 {
971                        e_in_group_id
972                    } else if e_in_group_id == 0 {
973                        0
974                    } else if e_in_group_id == 1 {
975                        (prev_in_group_id + 1) as u32
976                    } else {
977                        zigzag_decode(e_in_group_id as u64 - 1, (prev_in_group_id + 1) as u64)
978                            as u32
979                    };
980
981                    // DEBUG: Print encoded vs decoded values for investigation
982                    if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
983                        eprintln!("DECODE: item={item_idx}, group={c_group_id}, e_in_group_id={e_in_group_id}, prev={prev_in_group_id}, c_in_group_id={c_in_group_id}");
984                    }
985
986                    let c_raw_length =
987                        zigzag_decode(v_det[3][item_idx] as u64, pred_raw_length as u64) as u32;
988                    let c_is_rev_comp = v_det[4][item_idx] != 0;
989
990                    contig_segs.push(SegmentDesc::new(
991                        c_group_id,
992                        c_in_group_id,
993                        c_is_rev_comp,
994                        c_raw_length,
995                    ));
996
997                    // Apply update IMMEDIATELY (matching C++ AGC behavior at collection_v3.cpp:674-675)
998                    // NOTE: C++ AGC condition is: c_in_group_id > prev_in_group_id && c_in_group_id > 0
999                    // We must match this EXACTLY even though it causes issues with multiple samples per group!
1000                    if c_in_group_id as i32 > prev_in_group_id && c_in_group_id > 0 {
1001                        self.set_in_group_id(c_group_id as usize, c_in_group_id as i32);
1002                    }
1003
1004                    item_idx += 1;
1005                }
1006
1007                sample_segs.push(contig_segs);
1008            }
1009
1010            decoded_segments.push(sample_segs);
1011        }
1012
1013        // Second pass: assign to sample_desc
1014        for (i, sample_segs) in decoded_segments.into_iter().enumerate() {
1015            let curr_sample = &mut self.sample_desc[i_sample + i];
1016
1017            for (j, contig_segs) in sample_segs.into_iter().enumerate() {
1018                curr_sample.contigs[j].segments = contig_segs;
1019            }
1020        }
1021
1022        Ok(())
1023    }
1024
1025    /// Store sample names batch (with ZSTD compression)
1026    pub fn store_batch_sample_names(&mut self, archive: &mut Archive) -> Result<()> {
1027        let v_tmp = self.serialize_sample_names();
1028
1029        let v_data = zstd::encode_all(&v_tmp[..], 19).context("Failed to compress sample names")?;
1030
1031        let stream_id = self
1032            .collection_samples_id
1033            .context("collection-samples stream not registered")?;
1034
1035        archive.add_part(stream_id, &v_data, v_tmp.len() as u64)?;
1036
1037        Ok(())
1038    }
1039
1040    /// Load sample names batch (with ZSTD decompression)
1041    pub fn load_batch_sample_names(&mut self, archive: &mut Archive) -> Result<()> {
1042        let stream_id = self
1043            .collection_samples_id
1044            .context("collection-samples stream not found")?;
1045
1046        let (v_tmp, raw_size) = archive
1047            .get_part(stream_id)?
1048            .context("No sample names batch found")?;
1049
1050        let v_data = zstd::decode_all(&v_tmp[..]).context("Failed to decompress sample names")?;
1051
1052        if v_data.len() != raw_size as usize {
1053            anyhow::bail!(
1054                "Decompressed size mismatch: expected {}, got {}",
1055                raw_size,
1056                v_data.len()
1057            );
1058        }
1059
1060        self.deserialize_sample_names(&v_data)?;
1061
1062        Ok(())
1063    }
1064
1065    /// Load a batch of contigs (names + details)
1066    #[allow(clippy::needless_range_loop)]
1067    pub fn load_contig_batch(&mut self, archive: &mut Archive, id_batch: usize) -> Result<()> {
1068        // Load contig names
1069        let contig_stream_id = self
1070            .collection_contigs_id
1071            .context("collection-contigs stream not found")?;
1072
1073        let (v_tmp_names, raw_size_names) = archive.get_part_by_id(contig_stream_id, id_batch)?;
1074        let v_data_names =
1075            zstd::decode_all(&v_tmp_names[..]).context("Failed to decompress contig names")?;
1076
1077        if v_data_names.len() != raw_size_names as usize {
1078            anyhow::bail!(
1079                "Decompressed size mismatch for contig names: expected {}, got {}",
1080                raw_size_names,
1081                v_data_names.len()
1082            );
1083        }
1084
1085        self.deserialize_contig_names(&v_data_names, id_batch * self.batch_size)?;
1086
1087        // Load contig details
1088        let details_stream_id = self
1089            .collection_details_id
1090            .context("collection-details stream not found")?;
1091
1092        let (v_stream, _) = archive.get_part_by_id(details_stream_id, id_batch)?;
1093
1094        // Unpack the 5 streams
1095        let mut ptr = v_stream.as_slice();
1096        let mut sizes: [(u32, u32); 5] = Default::default();
1097
1098        for i in 0..5 {
1099            sizes[i].0 = CollectionVarInt::decode(&mut ptr)?; // raw size
1100            sizes[i].1 = CollectionVarInt::decode(&mut ptr)?; // compressed size
1101        }
1102
1103        let mut v_data_details_compressed: [Vec<u8>; 5] = Default::default();
1104        for i in 0..5 {
1105            v_data_details_compressed[i] = ptr[..sizes[i].1 as usize].to_vec();
1106            ptr = &ptr[sizes[i].1 as usize..];
1107        }
1108
1109        // Decompress each stream
1110        let mut v_data_details: [Vec<u8>; 5] = Default::default();
1111        for i in 0..5 {
1112            v_data_details[i] = zstd::decode_all(&v_data_details_compressed[i][..])
1113                .context(format!("Failed to decompress contig details stream {i}"))?;
1114
1115            if v_data_details[i].len() != sizes[i].0 as usize {
1116                anyhow::bail!(
1117                    "Decompressed size mismatch for details stream {}: expected {}, got {}",
1118                    i,
1119                    sizes[i].0,
1120                    v_data_details[i].len()
1121                );
1122            }
1123        }
1124
1125        self.deserialize_contig_details(&v_data_details, id_batch * self.batch_size)?;
1126
1127        Ok(())
1128    }
1129
1130    /// Store a batch of contigs (names + details)
1131    #[allow(clippy::needless_range_loop)]
1132    pub fn store_contig_batch(
1133        &mut self,
1134        archive: &mut Archive,
1135        id_from: usize,
1136        id_to: usize,
1137    ) -> Result<()> {
1138        // Store contig names
1139        let v_tmp_names = self.serialize_contig_names(id_from, id_to);
1140        let v_data_names =
1141            zstd::encode_all(&v_tmp_names[..], 18).context("Failed to compress contig names")?;
1142
1143        let contig_stream_id = self
1144            .collection_contigs_id
1145            .context("collection-contigs stream not registered")?;
1146
1147        archive.add_part(contig_stream_id, &v_data_names, v_tmp_names.len() as u64)?;
1148
1149        // Store contig details (5 streams)
1150        let v_data_details_raw = self.serialize_contig_details(id_from, id_to);
1151
1152        let mut v_data_details_compressed: [Vec<u8>; 5] = Default::default();
1153        for i in 0..5 {
1154            v_data_details_compressed[i] = zstd::encode_all(&v_data_details_raw[i][..], 19)
1155                .context(format!("Failed to compress contig details stream {i}"))?;
1156        }
1157
1158        // Pack into single stream
1159        let mut v_stream = Vec::new();
1160        for i in 0..5 {
1161            CollectionVarInt::encode(&mut v_stream, v_data_details_raw[i].len() as u32);
1162            CollectionVarInt::encode(&mut v_stream, v_data_details_compressed[i].len() as u32);
1163        }
1164        for i in 0..5 {
1165            v_stream.extend_from_slice(&v_data_details_compressed[i]);
1166        }
1167
1168        let details_stream_id = self
1169            .collection_details_id
1170            .context("collection-details stream not registered")?;
1171
1172        archive.add_part(details_stream_id, &v_stream, 0)?;
1173
1174        // Clear contigs from memory (for large datasets)
1175        for sample in &mut self.sample_desc[id_from..id_to] {
1176            sample.contigs.clear();
1177            sample.contigs.shrink_to_fit();
1178        }
1179
1180        Ok(())
1181    }
1182}
1183
1184#[cfg(test)]
1185mod tests {
1186    use super::*;
1187
1188    #[test]
1189    fn test_collection_varint_1byte() {
1190        let mut data = Vec::new();
1191        CollectionVarInt::encode(&mut data, 0);
1192        CollectionVarInt::encode(&mut data, 42);
1193        CollectionVarInt::encode(&mut data, 127);
1194
1195        let mut ptr = data.as_slice();
1196        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 0);
1197        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 42);
1198        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 127);
1199    }
1200
1201    #[test]
1202    fn test_collection_varint_2byte() {
1203        let mut data = Vec::new();
1204        CollectionVarInt::encode(&mut data, 128);
1205        CollectionVarInt::encode(&mut data, 255);
1206        CollectionVarInt::encode(&mut data, 16511);
1207
1208        let mut ptr = data.as_slice();
1209        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 128);
1210        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 255);
1211        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 16511);
1212    }
1213
1214    #[test]
1215    fn test_collection_varint_3byte() {
1216        let mut data = Vec::new();
1217        CollectionVarInt::encode(&mut data, 16512);
1218        CollectionVarInt::encode(&mut data, 65536);
1219        CollectionVarInt::encode(&mut data, 2113663);
1220
1221        let mut ptr = data.as_slice();
1222        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 16512);
1223        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 65536);
1224        assert_eq!(CollectionVarInt::decode(&mut ptr).unwrap(), 2113663);
1225    }
1226
1227    #[test]
1228    fn test_collection_varint_string() {
1229        let mut data = Vec::new();
1230        CollectionVarInt::encode_string(&mut data, "Hello");
1231        CollectionVarInt::encode_string(&mut data, "World");
1232
1233        let mut ptr = data.as_slice();
1234        assert_eq!(CollectionVarInt::decode_string(&mut ptr).unwrap(), "Hello");
1235        assert_eq!(CollectionVarInt::decode_string(&mut ptr).unwrap(), "World");
1236    }
1237
1238    #[test]
1239    fn test_zigzag_i64() {
1240        assert_eq!(zigzag_encode_i64(0), 0);
1241        assert_eq!(zigzag_encode_i64(1), 2);
1242        assert_eq!(zigzag_encode_i64(-1), 1);
1243        assert_eq!(zigzag_encode_i64(2), 4);
1244        assert_eq!(zigzag_encode_i64(-2), 3);
1245
1246        assert_eq!(zigzag_decode_i64(0), 0);
1247        assert_eq!(zigzag_decode_i64(2), 1);
1248        assert_eq!(zigzag_decode_i64(1), -1);
1249        assert_eq!(zigzag_decode_i64(4), 2);
1250        assert_eq!(zigzag_decode_i64(3), -2);
1251    }
1252
1253    #[test]
1254    fn test_zigzag_predictive() {
1255        // Test predictive zigzag encoding
1256        assert_eq!(zigzag_encode(100, 100), 0); // Same as previous
1257        assert_eq!(zigzag_encode(101, 100), 2); // Slightly above
1258        assert_eq!(zigzag_encode(99, 100), 1); // Slightly below
1259        assert_eq!(zigzag_encode(200, 100), 200); // Far away
1260
1261        assert_eq!(zigzag_decode(0, 100), 100);
1262        assert_eq!(zigzag_decode(2, 100), 101);
1263        assert_eq!(zigzag_decode(1, 100), 99);
1264        assert_eq!(zigzag_decode(200, 100), 200);
1265    }
1266
1267    #[test]
1268    fn test_segment_desc() {
1269        let seg = SegmentDesc::new(42, 7, true, 1000);
1270        assert_eq!(seg.group_id, 42);
1271        assert_eq!(seg.in_group_id, 7);
1272        assert!(seg.is_rev_comp);
1273        assert_eq!(seg.raw_length, 1000);
1274    }
1275
1276    #[test]
1277    fn test_collection_register() {
1278        let mut coll = CollectionV3::new();
1279
1280        assert!(coll.register_sample_contig("sample1", "contig1").unwrap());
1281        assert!(coll.register_sample_contig("sample1", "contig2").unwrap());
1282        assert!(coll.register_sample_contig("sample2", "contig1").unwrap());
1283
1284        assert_eq!(coll.get_no_samples(), 2);
1285
1286        let samples = coll.get_samples_list(false);
1287        assert_eq!(samples, vec!["sample1", "sample2"]);
1288    }
1289
1290    /// Test that in_group_id delta encoding/decoding works correctly
1291    /// This specifically tests the bug where deserialize updates in_group_id
1292    /// during the loop instead of after (like serialize does)
1293    #[test]
1294    fn test_in_group_id_delta_encoding_roundtrip() {
1295        let mut coll = CollectionV3::new();
1296        coll.set_config(60000, 31, None);
1297
1298        // Register samples and contigs
1299        coll.register_sample_contig("sample1", "contig1").unwrap();
1300        coll.register_sample_contig("sample1", "contig2").unwrap();
1301        coll.register_sample_contig("sample2", "contig1").unwrap();
1302
1303        // Add multiple segments from the same group (group 93) with consecutive in_group_ids
1304        // This reproduces the real-world scenario where multiple contigs have segments in the same group
1305        let test_segments = vec![
1306            ("sample1", "contig1", 0, 93, 0, false, 61000), // group 93, in_group_id 0
1307            ("sample1", "contig1", 1, 93, 1, false, 61000), // group 93, in_group_id 1
1308            ("sample1", "contig2", 0, 93, 2, false, 61000), // group 93, in_group_id 2
1309            ("sample1", "contig2", 1, 93, 3, false, 61000), // group 93, in_group_id 3
1310            ("sample2", "contig1", 0, 93, 4, false, 61000), // group 93, in_group_id 4
1311            ("sample2", "contig1", 1, 93, 5, false, 61000), // group 93, in_group_id 5
1312            ("sample2", "contig1", 2, 93, 6, false, 61000), // group 93, in_group_id 6
1313        ];
1314
1315        for (sample, contig, place, group_id, in_group_id, is_rev_comp, raw_len) in &test_segments {
1316            coll.add_segment_placed(
1317                sample,
1318                contig,
1319                *place,
1320                *group_id,
1321                *in_group_id,
1322                *is_rev_comp,
1323                *raw_len,
1324            )
1325            .unwrap();
1326        }
1327
1328        // Serialize the collection details
1329        let serialized = coll.serialize_contig_details(0, 2);
1330
1331        // Create a new collection and deserialize
1332        let mut coll2 = CollectionV3::new();
1333        coll2.set_config(60000, 31, None);
1334
1335        // Must set up the same sample structure
1336        coll2.register_sample_contig("sample1", "contig1").unwrap();
1337        coll2.register_sample_contig("sample1", "contig2").unwrap();
1338        coll2.register_sample_contig("sample2", "contig1").unwrap();
1339
1340        // Deserialize
1341        coll2.deserialize_contig_details(&serialized, 0).unwrap();
1342
1343        // Verify all in_group_ids match original values
1344        let sample1_contig1 = coll2.get_contig_desc("sample1", "contig1").unwrap();
1345        assert_eq!(
1346            sample1_contig1.len(),
1347            2,
1348            "sample1/contig1 should have 2 segments"
1349        );
1350        assert_eq!(
1351            sample1_contig1[0].in_group_id, 0,
1352            "sample1/contig1 segment 0 should have in_group_id=0"
1353        );
1354        assert_eq!(
1355            sample1_contig1[1].in_group_id, 1,
1356            "sample1/contig1 segment 1 should have in_group_id=1"
1357        );
1358
1359        let sample1_contig2 = coll2.get_contig_desc("sample1", "contig2").unwrap();
1360        assert_eq!(
1361            sample1_contig2.len(),
1362            2,
1363            "sample1/contig2 should have 2 segments"
1364        );
1365        assert_eq!(
1366            sample1_contig2[0].in_group_id, 2,
1367            "sample1/contig2 segment 0 should have in_group_id=2"
1368        );
1369        assert_eq!(
1370            sample1_contig2[1].in_group_id, 3,
1371            "sample1/contig2 segment 1 should have in_group_id=3"
1372        );
1373
1374        let sample2_contig1 = coll2.get_contig_desc("sample2", "contig1").unwrap();
1375        assert_eq!(
1376            sample2_contig1.len(),
1377            3,
1378            "sample2/contig1 should have 3 segments"
1379        );
1380        assert_eq!(
1381            sample2_contig1[0].in_group_id, 4,
1382            "sample2/contig1 segment 0 should have in_group_id=4"
1383        );
1384        assert_eq!(
1385            sample2_contig1[1].in_group_id, 5,
1386            "sample2/contig1 segment 1 should have in_group_id=5"
1387        );
1388        assert_eq!(
1389            sample2_contig1[2].in_group_id, 6,
1390            "sample2/contig1 segment 2 should have in_group_id=6"
1391        );
1392    }
1393}