ragc_common/
collection.rs

1// Collection metadata structures
2// Manages sample names, contig names, and segment descriptors
3
4use crate::Archive;
5use anyhow::{Context, Result};
6use std::collections::HashMap;
7
/// A segment descriptor identifying a compressed segment
///
/// # Architecture
///
/// Segments are organized into groups by terminal k-mers (front and back).
/// Each group has two streams:
/// - **ref_stream** (xGr): Contains reference segment (in_group_id = 0)
/// - **delta_stream** (xGd): Contains LZ-encoded segments (in_group_id = 1, 2, 3, ...)
///
/// For groups >= 16 (LZ-enabled):
/// - Reference is stored raw in ref_stream
/// - Other segments are LZ-encoded against reference
///
/// For groups 0-15 (raw-only):
/// - No LZ encoding, all segments stored raw
///
/// # Critical: in_group_id Indexing
///
/// - **in_group_id = 0**: ALWAYS the reference segment (in ref_stream)
/// - **in_group_id >= 1**: Delta segments (in delta_stream)
///
/// IMPORTANT: Do NOT store reference in delta_stream! It must be:
/// 1. Written to ref_stream
/// 2. Registered with in_group_id = 0
/// 3. Not included in delta pack
///
/// The type is a plain bundle of integers and a flag, so besides `PartialEq`
/// it also implements `Eq` and `Hash` and can be used as a set/map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SegmentDesc {
    /// Group the segment belongs to (`u32::MAX` in the [`SegmentDesc::empty`] placeholder).
    pub group_id: u32,
    /// Position inside the group: 0 = reference, 1+ = deltas.
    pub in_group_id: u32,
    /// Reverse-complement flag of the stored segment.
    pub is_rev_comp: bool,
    /// Raw length of the segment.
    pub raw_length: u32,
}

impl SegmentDesc {
    /// Builds a descriptor from its four components, stored verbatim.
    pub const fn new(group_id: u32, in_group_id: u32, is_rev_comp: bool, raw_length: u32) -> Self {
        Self {
            group_id,
            in_group_id,
            is_rev_comp,
            raw_length,
        }
    }

    /// Returns the placeholder descriptor: both ids set to `u32::MAX`, not
    /// reverse-complemented, zero length. Used to pad a contig's segment list
    /// for slots that have not been placed yet.
    pub const fn empty() -> Self {
        Self {
            group_id: u32::MAX,
            in_group_id: u32::MAX,
            is_rev_comp: false,
            raw_length: 0,
        }
    }
}
60
61/// A contig with its name and segments
62#[derive(Debug, Clone)]
63pub struct ContigDesc {
64    pub name: String,
65    pub segments: Vec<SegmentDesc>,
66}
67
68impl ContigDesc {
69    pub fn new(name: String) -> Self {
70        ContigDesc {
71            name,
72            segments: Vec::new(),
73        }
74    }
75}
76
77/// A sample with its name and contigs
78#[derive(Debug, Clone)]
79pub struct SampleDesc {
80    pub name: String,
81    pub contigs: Vec<ContigDesc>,
82}
83
84impl SampleDesc {
85    pub fn new(name: String) -> Self {
86        SampleDesc {
87            name,
88            contigs: Vec::new(),
89        }
90    }
91}
92
93/// Custom variable-length integer encoding for Collection metadata
94/// Different from standard varint - uses prefix bits to indicate length
95///
96/// Format:
97/// - 0-127:         1 byte  (0xxxxxxx)
98/// - 128-16511:     2 bytes (10xxxxxx xxxxxxxx)
99/// - 16512-2113663: 3 bytes (110xxxxx xxxxxxxx xxxxxxxx)
100/// - etc.
101pub struct CollectionVarInt;
102
103impl CollectionVarInt {
104    const THR_1: u32 = 1 << 7;
105    const THR_2: u32 = Self::THR_1 + (1 << 14);
106    const THR_3: u32 = Self::THR_2 + (1 << 21);
107    const THR_4: u32 = Self::THR_3 + (1 << 28);
108
109    const PREF_1: u8 = 0;
110    const PREF_2: u8 = 0b10000000u8;
111    const PREF_3: u8 = 0b11000000u8;
112    const PREF_4: u8 = 0b11100000u8;
113    const PREF_5: u8 = 0b11110000u8;
114
115    const MASK_1: u8 = 0b10000000u8;
116    const MASK_2: u8 = 0b11000000u8;
117    const MASK_3: u8 = 0b11100000u8;
118    const MASK_4: u8 = 0b11110000u8;
119
120    pub fn encode(data: &mut Vec<u8>, num: u32) {
121        if num < Self::THR_1 {
122            data.push(Self::PREF_1 + num as u8);
123        } else if num < Self::THR_2 {
124            let num = num - Self::THR_1;
125            data.push(Self::PREF_2 + (num >> 8) as u8);
126            data.push((num & 0xff) as u8);
127        } else if num < Self::THR_3 {
128            let num = num - Self::THR_2;
129            data.push(Self::PREF_3 + (num >> 16) as u8);
130            data.push(((num >> 8) & 0xff) as u8);
131            data.push((num & 0xff) as u8);
132        } else if num < Self::THR_4 {
133            let num = num - Self::THR_3;
134            data.push(Self::PREF_4 + (num >> 24) as u8);
135            data.push(((num >> 16) & 0xff) as u8);
136            data.push(((num >> 8) & 0xff) as u8);
137            data.push((num & 0xff) as u8);
138        } else {
139            let num = num - Self::THR_4;
140            data.push(Self::PREF_5);
141            data.push(((num >> 24) & 0xff) as u8);
142            data.push(((num >> 16) & 0xff) as u8);
143            data.push(((num >> 8) & 0xff) as u8);
144            data.push((num & 0xff) as u8);
145        }
146    }
147
148    pub fn decode(ptr: &mut &[u8]) -> Result<u32> {
149        if ptr.is_empty() {
150            anyhow::bail!("Unexpected end of data while decoding varint");
151        }
152
153        let first = ptr[0];
154
155        if (first & Self::MASK_1) == Self::PREF_1 {
156            let num = (first - Self::PREF_1) as u32;
157            *ptr = &ptr[1..];
158            Ok(num)
159        } else if (first & Self::MASK_2) == Self::PREF_2 {
160            if ptr.len() < 2 {
161                anyhow::bail!("Unexpected end of data while decoding 2-byte varint");
162            }
163            let num =
164                ((ptr[0] as u32) << 8) + ptr[1] as u32 + Self::THR_1 - ((Self::PREF_2 as u32) << 8);
165            *ptr = &ptr[2..];
166            Ok(num)
167        } else if (first & Self::MASK_3) == Self::PREF_3 {
168            if ptr.len() < 3 {
169                anyhow::bail!("Unexpected end of data while decoding 3-byte varint");
170            }
171            let num =
172                ((ptr[0] as u32) << 16) + ((ptr[1] as u32) << 8) + ptr[2] as u32 + Self::THR_2
173                    - ((Self::PREF_3 as u32) << 16);
174            *ptr = &ptr[3..];
175            Ok(num)
176        } else if (first & Self::MASK_4) == Self::PREF_4 {
177            if ptr.len() < 4 {
178                anyhow::bail!("Unexpected end of data while decoding 4-byte varint");
179            }
180            let num = ((ptr[0] as u32) << 24)
181                + ((ptr[1] as u32) << 16)
182                + ((ptr[2] as u32) << 8)
183                + ptr[3] as u32
184                + Self::THR_3
185                - ((Self::PREF_4 as u32) << 24);
186            *ptr = &ptr[4..];
187            Ok(num)
188        } else {
189            // PREF_5
190            if ptr.len() < 5 {
191                anyhow::bail!("Unexpected end of data while decoding 5-byte varint");
192            }
193            let mut num = ptr[1] as u32;
194            num <<= 8;
195            num += ptr[2] as u32;
196            num <<= 8;
197            num += ptr[3] as u32;
198            num <<= 8;
199            num += ptr[4] as u32;
200            num += Self::THR_4;
201            *ptr = &ptr[5..];
202            Ok(num)
203        }
204    }
205
206    pub fn encode_string(data: &mut Vec<u8>, s: &str) {
207        data.extend_from_slice(s.as_bytes());
208        data.push(0);
209    }
210
211    pub fn decode_string(ptr: &mut &[u8]) -> Result<String> {
212        let end = ptr
213            .iter()
214            .position(|&b| b == 0)
215            .context("Null terminator not found in string")?;
216
217        let s = String::from_utf8(ptr[..end].to_vec()).context("Invalid UTF-8 in string")?;
218
219        *ptr = &ptr[end + 1..];
220        Ok(s)
221    }
222}
223
/// Zigzag encoding for signed differences
///
/// Maps 0, -1, 1, -2, 2, ... to 0, 1, 2, 3, 4, ... so that values of small
/// magnitude get small unsigned codes.
///
/// Uses the shift/xor form, which is defined for the whole `i64` range:
/// `i64::MAX` maps to `u64::MAX - 1` and `i64::MIN` maps to `u64::MAX`
/// without any intermediate overflow.
pub fn zigzag_encode_i64(x: i64) -> u64 {
    // `x >> 63` is an arithmetic shift: all ones for negative x, zero otherwise.
    ((x << 1) ^ (x >> 63)) as u64
}
232
/// Inverse of the signed zigzag mapping: odd codes decode to negative values
/// (1 -> -1, 3 -> -2, ...), even codes to non-negative ones (0 -> 0, 2 -> 1, ...).
///
/// Uses the shift/xor form so that every `u64` decodes without overflow;
/// in particular `u64::MAX` yields `i64::MIN`.
pub fn zigzag_decode_i64(x: u64) -> i64 {
    // `-(x & 1)` is all ones for odd codes, flipping the magnitude bits into
    // the two's-complement negative; it is zero for even codes.
    ((x >> 1) as i64) ^ -((x & 1) as i64)
}
240
/// Zigzag encoding with prediction
///
/// Encodes `x_curr` relative to the predicted value `x_prev`:
/// - below the prediction: odd code `2 * (x_prev - x_curr) - 1`
/// - at or above it but below `2 * x_prev`: even code `2 * (x_curr - x_prev)`
/// - otherwise: `x_curr` itself
pub fn zigzag_encode(x_curr: u64, x_prev: u64) -> u64 {
    if x_curr < x_prev {
        let shortfall = x_prev - x_curr;
        return 2 * shortfall - 1;
    }

    let escape_threshold = 2 * x_prev;
    if x_curr < escape_threshold {
        let excess = x_curr - x_prev;
        2 * excess
    } else {
        x_curr
    }
}
251
/// Inverse of the predictive zigzag encoding: recovers the value from its
/// code `x_val` and the same prediction `x_prev` that was used for encoding.
///
/// Codes of at least `2 * x_prev` are literal values; smaller odd codes lie
/// below the prediction, smaller even codes at or above it.
pub fn zigzag_decode(x_val: u64, x_prev: u64) -> u64 {
    let escape_threshold = 2 * x_prev;

    if x_val >= escape_threshold {
        return x_val;
    }

    let is_below_prediction = x_val & 1 != 0;
    if is_below_prediction {
        (escape_threshold - x_val) / 2
    } else {
        (x_val + escape_threshold) / 2
    }
}
261
/// Collection V3 - manages metadata for AGC archives
///
/// Keeps the list of samples (each with its contigs and their segment
/// descriptors), a name -> index lookup for samples, the ids of the three
/// archive streams the metadata is stored in, and the bookkeeping needed
/// while serializing/deserializing batches.
pub struct CollectionV3 {
    /// Sample descriptors; a sample's id is its index in this vector.
    sample_desc: Vec<SampleDesc>,
    /// Maps a sample name to its index in `sample_desc`.
    sample_ids: HashMap<String, usize>,

    // Archive stream IDs
    /// Id of the "collection-samples" stream, once registered/looked up.
    collection_samples_id: Option<usize>,
    /// Id of the "collection-contigs" stream, once registered/looked up.
    collection_contigs_id: Option<usize>,
    /// Id of the "collection-details" stream, once registered/looked up.
    collection_details_id: Option<usize>,

    // Configuration
    /// Batch size setting (defaults to `1 << 20`).
    batch_size: usize,
    /// Segment size setting; used together with `kmer_length` to predict raw
    /// segment lengths when encoding details.
    segment_size: u32,
    /// K-mer length setting.
    kmer_length: u32,

    // State tracking
    /// Name of the sample most recently passed to `register_sample_contig`.
    prev_sample_name: String,
    #[allow(dead_code)]
    placing_sample_name: String,
    #[allow(dead_code)]
    placing_sample_id: usize,
    /// Number of samples announced by the most recently deserialized contig batch.
    no_samples_in_last_batch: usize,
    /// Cumulative count of samples loaded from batches (for multi-batch archives)
    samples_loaded: usize,

    // For in_group_id delta encoding
    /// Per-group tracking of the last in_group_id seen, indexed by group id;
    /// -1 marks a group that has not been seen yet.
    in_group_ids: Vec<i32>,
}
290
/// `Default` simply delegates to [`CollectionV3::new`], so both construct the
/// same empty collection.
impl Default for CollectionV3 {
    fn default() -> Self {
        Self::new()
    }
}
296
297impl CollectionV3 {
298    pub fn new() -> Self {
299        CollectionV3 {
300            sample_desc: Vec::new(),
301            sample_ids: HashMap::new(),
302            collection_samples_id: None,
303            collection_contigs_id: None,
304            collection_details_id: None,
305            batch_size: 1 << 20, // 1MB batches
306            segment_size: 0,
307            kmer_length: 0,
308            prev_sample_name: String::new(),
309            placing_sample_name: String::new(),
310            placing_sample_id: 0,
311            no_samples_in_last_batch: 0,
312            samples_loaded: 0,
313            in_group_ids: Vec::new(),
314        }
315    }
316
317    pub fn set_config(&mut self, segment_size: u32, kmer_length: u32, batch_size: Option<usize>) {
318        self.segment_size = segment_size;
319        self.kmer_length = kmer_length;
320        if let Some(bs) = batch_size {
321            self.batch_size = bs;
322        }
323    }
324
325    pub fn prepare_for_compression(&mut self, archive: &mut Archive) -> Result<()> {
326        self.collection_samples_id = Some(archive.register_stream("collection-samples"));
327        self.collection_contigs_id = Some(archive.register_stream("collection-contigs"));
328        self.collection_details_id = Some(archive.register_stream("collection-details"));
329        Ok(())
330    }
331
332    pub fn prepare_for_decompression(&mut self, archive: &Archive) -> Result<()> {
333        self.collection_samples_id = archive.get_stream_id("collection-samples");
334        self.collection_contigs_id = archive.get_stream_id("collection-contigs");
335        self.collection_details_id = archive.get_stream_id("collection-details");
336
337        if self.collection_samples_id.is_none() {
338            anyhow::bail!("collection-samples stream not found in archive");
339        }
340        if self.collection_contigs_id.is_none() {
341            anyhow::bail!("collection-contigs stream not found in archive");
342        }
343        if self.collection_details_id.is_none() {
344            anyhow::bail!("collection-details stream not found in archive");
345        }
346
347        Ok(())
348    }
349
350    /// Register a sample and contig (idempotent - adds contig to existing sample if needed)
351    pub fn register_sample_contig(&mut self, sample_name: &str, contig_name: &str) -> Result<bool> {
352        let mut stored_sample_name = sample_name.to_string();
353
354        if sample_name.is_empty() {
355            stored_sample_name = Self::extract_contig_name(contig_name);
356        }
357
358        // Get or create sample
359        let sample_id = if let Some(&id) = self.sample_ids.get(&stored_sample_name) {
360            // Sample already exists
361            id
362        } else {
363            // New sample
364            let id = self.sample_ids.len();
365            self.sample_ids.insert(stored_sample_name.clone(), id);
366            self.sample_desc
367                .push(SampleDesc::new(stored_sample_name.clone()));
368            id
369        };
370
371        self.prev_sample_name = stored_sample_name;
372
373        // Add contig to the sample (avoid duplicates)
374        let sample = &mut self.sample_desc[sample_id];
375        if !sample.contigs.iter().any(|c| c.name == contig_name) {
376            sample
377                .contigs
378                .push(ContigDesc::new(contig_name.to_string()));
379            Ok(true)
380        } else {
381            Ok(false) // Contig already registered
382        }
383    }
384
385    /// Add segment placement information
386    #[allow(clippy::too_many_arguments)]
387    pub fn add_segment_placed(
388        &mut self,
389        sample_name: &str,
390        contig_name: &str,
391        place: usize,
392        group_id: u32,
393        in_group_id: u32,
394        is_rev_comp: bool,
395        raw_length: u32,
396    ) -> Result<()> {
397        let mut stored_sample_name = sample_name.to_string();
398
399        if sample_name.is_empty() {
400            stored_sample_name = Self::extract_contig_name(contig_name);
401        }
402
403        // BUGFIX: Always lookup sample_id fresh instead of caching
404        // The cache was causing segments to be registered to the wrong sample
405        // when GroupWriters buffer segments across sample boundaries
406        let sample_id = *self
407            .sample_ids
408            .get(&stored_sample_name)
409            .context(format!("Sample not found: {stored_sample_name}"))?;
410
411        if std::env::var("RAGC_DEBUG_REGISTER").is_ok() {
412            eprintln!("REGISTER: sample='{stored_sample_name}' (id={sample_id}), contig='{contig_name}', place={place}");
413        }
414
415        let sample = &mut self.sample_desc[sample_id];
416        for contig in &mut sample.contigs {
417            if contig.name == contig_name {
418                if place >= contig.segments.len() {
419                    contig.segments.resize(place + 1, SegmentDesc::empty());
420                }
421                contig.segments[place] =
422                    SegmentDesc::new(group_id, in_group_id, is_rev_comp, raw_length);
423                return Ok(());
424            }
425        }
426
427        anyhow::bail!("Contig {contig_name} not found in sample {stored_sample_name}");
428    }
429
430    /// Get list of samples
431    pub fn get_samples_list(&self, sorted: bool) -> Vec<String> {
432        let mut samples: Vec<String> = self.sample_desc.iter().map(|s| s.name.clone()).collect();
433        if sorted {
434            samples.sort();
435        }
436        samples
437    }
438
    /// Get number of samples
    ///
    /// Counts every sample descriptor currently held by the collection.
    pub fn get_no_samples(&self) -> usize {
        self.sample_desc.len()
    }
443
444    /// Get number of contigs in a sample
445    pub fn get_no_contigs(&self, sample_name: &str) -> Option<usize> {
446        self.sample_ids
447            .get(sample_name)
448            .map(|&id| self.sample_desc[id].contigs.len())
449    }
450
451    /// Get number of contig batches from archive
452    pub fn get_no_contig_batches(&self, archive: &Archive) -> Result<usize> {
453        let contig_stream_id = self
454            .collection_contigs_id
455            .context("collection-contigs stream not found")?;
456        Ok(archive.get_num_parts(contig_stream_id))
457    }
458
459    /// Get contig list for a sample
460    pub fn get_contig_list(&self, sample_name: &str) -> Option<Vec<String>> {
461        self.sample_ids.get(sample_name).map(|&id| {
462            self.sample_desc[id]
463                .contigs
464                .iter()
465                .map(|c| c.name.clone())
466                .collect()
467        })
468    }
469
470    /// Get sample descriptor with all contigs and segments
471    pub fn get_sample_desc(&self, sample_name: &str) -> Option<Vec<(String, Vec<SegmentDesc>)>> {
472        self.sample_ids.get(sample_name).map(|&id| {
473            self.sample_desc[id]
474                .contigs
475                .iter()
476                .map(|c| (c.name.clone(), c.segments.clone()))
477                .collect()
478        })
479    }
480
481    /// Get contig descriptor for a specific contig in a sample
482    pub fn get_contig_desc(
483        &self,
484        sample_name: &str,
485        contig_name: &str,
486    ) -> Option<Vec<SegmentDesc>> {
487        self.sample_ids.get(sample_name).and_then(|&id| {
488            self.sample_desc[id]
489                .contigs
490                .iter()
491                .find(|c| c.name == contig_name)
492                .map(|c| c.segments.clone())
493        })
494    }
495
    /// Get configuration parameters (segment_size, kmer_length)
    ///
    /// The tuple is ordered exactly as named: segment size first, k-mer
    /// length second.
    pub fn get_params(&self) -> (u32, u32) {
        (self.segment_size, self.kmer_length)
    }
500
501    /// Extract short contig name (first word)
502    fn extract_contig_name(s: &str) -> String {
503        s.split_whitespace().next().unwrap_or(s).to_string()
504    }
505
506    // Serialization/Deserialization methods
507
508    /// Serialize sample names
509    fn serialize_sample_names(&self) -> Vec<u8> {
510        let mut data = Vec::new();
511        CollectionVarInt::encode(&mut data, self.sample_desc.len() as u32);
512
513        for sample in &self.sample_desc {
514            CollectionVarInt::encode_string(&mut data, &sample.name);
515        }
516
517        data
518    }
519
520    /// Deserialize sample names
521    fn deserialize_sample_names(&mut self, data: &[u8]) -> Result<()> {
522        let mut ptr = data;
523
524        let no_samples = CollectionVarInt::decode(&mut ptr)?;
525
526        self.sample_desc.clear();
527        self.sample_ids.clear();
528
529        for i in 0..no_samples {
530            let name = CollectionVarInt::decode_string(&mut ptr)?;
531            self.sample_ids.insert(name.clone(), i as usize);
532            self.sample_desc.push(SampleDesc::new(name));
533        }
534
535        Ok(())
536    }
537
538    /// Split a string by spaces
539    fn split_string(s: &str) -> Vec<String> {
540        s.split(' ').map(|s| s.to_string()).collect()
541    }
542
543    /// Encode contig name using delta from previous name (returns raw bytes)
544    fn encode_split(prev_split: &[String], curr_split: &[String]) -> Vec<u8> {
545        let mut enc = Vec::new();
546
547        for i in 0..curr_split.len() {
548            if prev_split[i] == curr_split[i] {
549                enc.push((-127i8) as u8); // Same component marker (0x81 = 129)
550            } else if prev_split[i].len() != curr_split[i].len() {
551                enc.extend_from_slice(curr_split[i].as_bytes());
552            } else {
553                // Encode with run-length for identical characters
554                let mut cnt: i8 = 0;
555                let p_bytes = prev_split[i].as_bytes();
556                let c_bytes = curr_split[i].as_bytes();
557
558                for j in 0..c_bytes.len() {
559                    if p_bytes[j] == c_bytes[j] {
560                        if cnt == 100 {
561                            enc.push((-cnt) as u8); // Repetition marker
562                            cnt = 1;
563                        } else {
564                            cnt += 1;
565                        }
566                    } else {
567                        if cnt > 0 {
568                            enc.push((-cnt) as u8); // Repetition marker
569                            cnt = 0;
570                        }
571                        enc.push(c_bytes[j]);
572                    }
573                }
574
575                if cnt > 0 {
576                    enc.push((-cnt) as u8); // Repetition marker
577                }
578            }
579
580            enc.push(b' ');
581        }
582
583        // Remove final space
584        if enc.last() == Some(&b' ') {
585            enc.pop();
586        }
587
588        enc
589    }
590
591    /// Decode contig name using delta from previous name (works with byte vectors)
592    /// IMPORTANT: Updates curr_split in-place to contain decoded components (matches C++ AGC)
593    /// Returns the decoded string
594    #[allow(clippy::ptr_arg)]
595    fn decode_split_bytes(prev_split: &Vec<Vec<u8>>, curr_split: &mut Vec<Vec<u8>>) -> String {
596        let mut dec = Vec::new();
597
598        for i in 0..curr_split.len() {
599            if curr_split[i].len() == 1 && (curr_split[i][0] as i8) == -127 {
600                // Same component marker (-127 = byte 129)
601                dec.extend_from_slice(&prev_split[i]);
602                curr_split[i] = prev_split[i].clone();
603            } else {
604                let mut cmp = Vec::new();
605                let p_bytes = &prev_split[i];
606                let mut p_idx = 0;
607
608                for &byte in &curr_split[i] {
609                    let c = byte as i8;
610                    if c >= 0 {
611                        // Literal character
612                        cmp.push(byte);
613                        p_idx += 1;
614                    } else {
615                        // Repetition count (negative byte value)
616                        let count = -c as usize;
617                        cmp.extend_from_slice(&p_bytes[p_idx..p_idx + count]);
618                        p_idx += count;
619                    }
620                }
621
622                dec.extend_from_slice(&cmp);
623                curr_split[i] = cmp;
624            }
625
626            dec.push(b' ');
627        }
628
629        dec.pop(); // Remove final space
630        String::from_utf8(dec).expect("Invalid UTF-8 in decoded contig name")
631    }
632
633    /// Serialize contig names for a batch of samples
634    fn serialize_contig_names(&self, id_from: usize, id_to: usize) -> Vec<u8> {
635        let mut data = Vec::new();
636
637        CollectionVarInt::encode(&mut data, (id_to - id_from) as u32);
638
639        if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
640            eprintln!("SERIALIZE_CONTIG_NAMES: samples {id_from}..{id_to}");
641        }
642
643        for (sample_idx, sample) in self.sample_desc[id_from..id_to].iter().enumerate() {
644            CollectionVarInt::encode(&mut data, sample.contigs.len() as u32);
645
646            if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
647                eprintln!(
648                    "  Sample {} ({}): {} contigs",
649                    id_from + sample_idx,
650                    sample.name,
651                    sample.contigs.len()
652                );
653            }
654
655            let mut prev_split = Vec::new();
656
657            for (contig_idx, contig) in sample.contigs.iter().enumerate() {
658                let curr_split = Self::split_string(&contig.name);
659                let before_len = data.len();
660
661                if curr_split.len() != prev_split.len() {
662                    CollectionVarInt::encode_string(&mut data, &contig.name);
663
664                    if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
665                        eprintln!(
666                            "    Contig {}: '{}' (FULL) -> {} bytes",
667                            contig_idx,
668                            contig.name,
669                            data.len() - before_len
670                        );
671                    }
672                } else {
673                    let enc_bytes = Self::encode_split(&prev_split, &curr_split);
674                    // Write as raw bytes with null terminator
675                    data.extend_from_slice(&enc_bytes);
676                    data.push(0);
677
678                    if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
679                        eprintln!("    Contig {}: '{}' (DELTA prev='{}') -> enc_bytes={:?}, {} bytes total",
680                            contig_idx, contig.name, prev_split.join(" "), enc_bytes, data.len() - before_len);
681                    }
682                }
683
684                prev_split = curr_split;
685            }
686        }
687
688        if std::env::var("RAGC_DEBUG_CONTIG_NAMES").is_ok() {
689            eprintln!("  Total serialized size: {} bytes", data.len());
690        }
691
692        data
693    }
694
695    /// Decode raw bytes string (handles non-UTF8 bytes)
696    fn decode_bytes_string(ptr: &mut &[u8]) -> Result<Vec<u8>> {
697        let end = ptr
698            .iter()
699            .position(|&b| b == 0)
700            .context("Null terminator not found in bytes string")?;
701
702        let bytes = ptr[..end].to_vec();
703        *ptr = &ptr[end + 1..];
704        Ok(bytes)
705    }
706
707    /// Deserialize contig names for a batch of samples
708    fn deserialize_contig_names(&mut self, data: &[u8], i_sample: usize) -> Result<()> {
709        let mut ptr = data;
710
711        let no_samples_in_curr_batch = CollectionVarInt::decode(&mut ptr)? as usize;
712
713        for i in 0..no_samples_in_curr_batch {
714            let no_contigs = CollectionVarInt::decode(&mut ptr)? as usize;
715
716            let curr_sample = &mut self.sample_desc[i_sample + i];
717            curr_sample.contigs.clear();
718            curr_sample.contigs.reserve(no_contigs);
719
720            let mut prev_split: Vec<Vec<u8>> = Vec::new();
721
722            for _ in 0..no_contigs {
723                let enc_bytes = Self::decode_bytes_string(&mut ptr)?;
724
725                // CRITICAL: Match C++ AGC logic exactly!
726                // C++ AGC (collection_v3.cpp lines 519-531):
727                //   read(p, enc);  // Read encoded string (can contain bytes > 127)
728                //   curr_split = split_string(enc);  // Split by spaces
729                //   if (curr_split.size() != prev_split.size())
730                //       curr_sample.contigs[j].name = enc;  // Plain text
731                //   else
732                //       curr_sample.contigs[j].name = decode_split(prev_split, curr_split);  // Delta
733                //   prev_split = move(curr_split);  // Update to current split
734                //
735                // KEY: C++ std::string can hold arbitrary bytes, including delta-encoded bytes >127
736                // In Rust, we use Vec<u8> to preserve delta bytes until after decoding
737
738                // Split by spaces (keep as bytes to preserve delta encoding)
739                let mut curr_split: Vec<Vec<u8>> = enc_bytes
740                    .split(|&b| b == b' ')
741                    .map(|s| s.to_vec())
742                    .collect();
743
744                let name = if prev_split.is_empty() || curr_split.len() != prev_split.len() {
745                    // Plain text: first contig OR different number of components
746                    String::from_utf8_lossy(&enc_bytes).to_string()
747                } else {
748                    // Delta-encoded: decode_split_bytes updates curr_split in-place!
749                    Self::decode_split_bytes(&prev_split, &mut curr_split)
750                };
751
752                // CRITICAL: Update prev_split to curr_split (which now contains decoded bytes)
753                prev_split = curr_split;
754
755                curr_sample.contigs.push(ContigDesc::new(name));
756            }
757        }
758
759        self.no_samples_in_last_batch = no_samples_in_curr_batch;
760
761        Ok(())
762    }
763
    /// Clear in_group_ids tracking
    ///
    /// Empties the tracking vector, so every group reads as unseen (-1)
    /// again through `get_in_group_id`.
    fn clear_in_group_ids(&mut self) {
        self.in_group_ids.clear();
    }
768
769    /// Get in_group_id for a group
770    fn get_in_group_id(&self, pos: usize) -> i32 {
771        self.in_group_ids.get(pos).copied().unwrap_or(-1)
772    }
773
774    /// Set in_group_id for a group
775    fn set_in_group_id(&mut self, pos: usize, val: i32) {
776        if pos >= self.in_group_ids.len() {
777            self.in_group_ids
778                .resize((pos as f64 * 1.2) as usize + 1, -1);
779        }
780        self.in_group_ids[pos] = val;
781    }
782
783    /// Serialize contig details (5-stream format)
784    fn serialize_contig_details(&mut self, id_from: usize, id_to: usize) -> [Vec<u8>; 5] {
785        let mut v_data: [Vec<u8>; 5] = Default::default();
786
787        CollectionVarInt::encode(&mut v_data[0], (id_to - id_from) as u32);
788
789        self.clear_in_group_ids();
790
791        let pred_raw_length = self.segment_size + self.kmer_length;
792
793        // First pass: compute encoded values
794        // IMPORTANT: Update in_group_ids IMMEDIATELY after encoding each segment (matching C++ AGC)
795        let mut encoded_segments = Vec::new();
796
797        for sample_idx in id_from..id_to {
798            let mut sample_encoded = Vec::new();
799
800            for contig_idx in 0..self.sample_desc[sample_idx].contigs.len() {
801                let mut contig_encoded = Vec::new();
802
803                for seg_idx in 0..self.sample_desc[sample_idx].contigs[contig_idx]
804                    .segments
805                    .len()
806                {
807                    let seg = &self.sample_desc[sample_idx].contigs[contig_idx].segments[seg_idx];
808                    let prev_in_group_id = self.get_in_group_id(seg.group_id as usize);
809
810                    let e_group_id = seg.group_id;
811                    let e_in_group_id = if prev_in_group_id == -1 {
812                        seg.in_group_id
813                    } else if seg.in_group_id == 0 {
814                        0
815                    } else if seg.in_group_id as i32 == prev_in_group_id + 1 {
816                        1
817                    } else {
818                        zigzag_encode(seg.in_group_id as u64, (prev_in_group_id + 1) as u64) as u32
819                            + 1
820                    };
821
822                    let e_raw_length =
823                        zigzag_encode(seg.raw_length as u64, pred_raw_length as u64) as u32;
824
825                    // DEBUG: Print encoded values during serialization
826                    if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
827                        eprintln!(
828                            "ENCODE: group={}, in_group_id={}, prev={}, e_in_group_id={}",
829                            seg.group_id, seg.in_group_id, prev_in_group_id, e_in_group_id
830                        );
831                    }
832
833                    contig_encoded.push((e_group_id, e_in_group_id, e_raw_length, seg.is_rev_comp));
834
835                    // Apply update IMMEDIATELY (matching C++ AGC behavior at collection_v3.cpp:581-582)
836                    // NOTE: C++ AGC condition is: c_in_group_id > prev_in_group_id && c_in_group_id > 0
837                    // We must match this EXACTLY even though it seems wrong!
838                    if seg.in_group_id as i32 > prev_in_group_id && seg.in_group_id > 0 {
839                        self.set_in_group_id(seg.group_id as usize, seg.in_group_id as i32);
840                    }
841                }
842
843                sample_encoded.push(contig_encoded);
844            }
845
846            encoded_segments.push(sample_encoded);
847        }
848
849        // Second pass: write to output streams
850        if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
851            eprintln!(
852                "SERIALIZE: Writing {} samples to collection",
853                encoded_segments.len()
854            );
855        }
856
857        for (sample_idx, sample_encoded) in encoded_segments.iter().enumerate() {
858            let before_len = v_data[0].len();
859            CollectionVarInt::encode(&mut v_data[0], sample_encoded.len() as u32);
860            let after_len = v_data[0].len();
861
862            if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
863                eprintln!("  Sample {}: {} contigs", sample_idx, sample_encoded.len());
864                eprintln!(
865                    "    Wrote bytes [{}..{}]: {:?}",
866                    before_len,
867                    after_len,
868                    &v_data[0][before_len..after_len]
869                );
870            }
871
872            for (contig_idx, contig_encoded) in sample_encoded.iter().enumerate() {
873                let before_len = v_data[0].len();
874                CollectionVarInt::encode(&mut v_data[0], contig_encoded.len() as u32);
875                let after_len = v_data[0].len();
876
877                if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
878                    eprintln!(
879                        "    Contig {}: {} segments",
880                        contig_idx,
881                        contig_encoded.len()
882                    );
883                    eprintln!(
884                        "      Wrote bytes [{}..{}]: {:?}",
885                        before_len,
886                        after_len,
887                        &v_data[0][before_len..after_len]
888                    );
889                }
890
891                for &(e_group_id, e_in_group_id, e_raw_length, is_rev_comp) in contig_encoded {
892                    CollectionVarInt::encode(&mut v_data[1], e_group_id);
893                    CollectionVarInt::encode(&mut v_data[2], e_in_group_id);
894                    CollectionVarInt::encode(&mut v_data[3], e_raw_length);
895                    CollectionVarInt::encode(&mut v_data[4], is_rev_comp as u32);
896                }
897            }
898
899            if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
900                let total_segments: usize = sample_encoded.iter().map(|c| c.len()).sum();
901                eprintln!("  Sample {sample_idx} total: {total_segments} segments");
902            }
903        }
904
905        if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
906            eprintln!(
907                "Stream sizes: [0]={}, [1]={}, [2]={}, [3]={}, [4]={}",
908                v_data[0].len(),
909                v_data[1].len(),
910                v_data[2].len(),
911                v_data[3].len(),
912                v_data[4].len()
913            );
914        }
915
916        v_data
917    }
918
919    /// Deserialize contig details (5-stream format)
920    fn deserialize_contig_details(&mut self, v_data: &[Vec<u8>; 5], i_sample: usize) -> Result<()> {
921        let mut ptr0 = v_data[0].as_slice();
922
923        let no_samples_in_curr_batch = CollectionVarInt::decode(&mut ptr0)? as usize;
924
925        if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
926            eprintln!("DESERIALIZE: Reading {no_samples_in_curr_batch} samples from collection");
927        }
928
929        // First pass: decode structure and counts
930        let mut structure = Vec::new();
931        for sample_idx in 0..no_samples_in_curr_batch {
932            let no_contigs = CollectionVarInt::decode(&mut ptr0)? as usize;
933            let mut contig_seg_counts = Vec::new();
934
935            if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
936                eprintln!("  Sample {sample_idx}: {no_contigs} contigs");
937            }
938
939            for contig_idx in 0..no_contigs {
940                let no_segments = CollectionVarInt::decode(&mut ptr0)? as usize;
941                contig_seg_counts.push(no_segments);
942
943                if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
944                    eprintln!("    Contig {contig_idx}: {no_segments} segments");
945                }
946            }
947
948            structure.push(contig_seg_counts);
949        }
950
951        // Decode detail streams
952        let mut v_det: [Vec<u32>; 5] = Default::default();
953        let mut no_items = 0;
954
955        for counts in &structure {
956            for &count in counts {
957                no_items += count;
958            }
959        }
960
961        for i in 1..5 {
962            v_det[i].reserve(no_items);
963            let mut ptr = v_data[i].as_slice();
964
965            for _ in 0..no_items {
966                v_det[i].push(CollectionVarInt::decode(&mut ptr)?);
967            }
968        }
969
970        // Reconstruct segments - first pass: decode all values
971        self.clear_in_group_ids();
972
973        let pred_raw_length = self.segment_size + self.kmer_length;
974        let mut item_idx = 0;
975
976        // Collect all decoded segments first
977        // IMPORTANT: Update in_group_ids IMMEDIATELY after decoding each segment (matching C++ AGC)
978        let mut decoded_segments = Vec::new();
979
980        for contig_seg_counts in &structure {
981            let mut sample_segs = Vec::new();
982
983            for &no_segments in contig_seg_counts {
984                let mut contig_segs = Vec::new();
985
986                for _ in 0..no_segments {
987                    let c_group_id = v_det[1][item_idx];
988                    let prev_in_group_id = self.get_in_group_id(c_group_id as usize);
989
990                    let e_in_group_id = v_det[2][item_idx];
991                    let c_in_group_id = if prev_in_group_id == -1 {
992                        e_in_group_id
993                    } else if e_in_group_id == 0 {
994                        0
995                    } else if e_in_group_id == 1 {
996                        (prev_in_group_id + 1) as u32
997                    } else {
998                        zigzag_decode(e_in_group_id as u64 - 1, (prev_in_group_id + 1) as u64)
999                            as u32
1000                    };
1001
1002                    // DEBUG: Print encoded vs decoded values for investigation
1003                    if std::env::var("RAGC_DEBUG_COLLECTION").is_ok() {
1004                        eprintln!("DECODE: item={item_idx}, group={c_group_id}, e_in_group_id={e_in_group_id}, prev={prev_in_group_id}, c_in_group_id={c_in_group_id}");
1005                    }
1006
1007                    let c_raw_length =
1008                        zigzag_decode(v_det[3][item_idx] as u64, pred_raw_length as u64) as u32;
1009                    let c_is_rev_comp = v_det[4][item_idx] != 0;
1010
1011                    contig_segs.push(SegmentDesc::new(
1012                        c_group_id,
1013                        c_in_group_id,
1014                        c_is_rev_comp,
1015                        c_raw_length,
1016                    ));
1017
1018                    // Apply update IMMEDIATELY (matching C++ AGC behavior at collection_v3.cpp:674-675)
1019                    // NOTE: C++ AGC condition is: c_in_group_id > prev_in_group_id && c_in_group_id > 0
1020                    // We must match this EXACTLY even though it causes issues with multiple samples per group!
1021                    if c_in_group_id as i32 > prev_in_group_id && c_in_group_id > 0 {
1022                        self.set_in_group_id(c_group_id as usize, c_in_group_id as i32);
1023                    }
1024
1025                    item_idx += 1;
1026                }
1027
1028                sample_segs.push(contig_segs);
1029            }
1030
1031            decoded_segments.push(sample_segs);
1032        }
1033
1034        // Second pass: assign to sample_desc
1035        for (i, sample_segs) in decoded_segments.into_iter().enumerate() {
1036            let curr_sample = &mut self.sample_desc[i_sample + i];
1037
1038            for (j, contig_segs) in sample_segs.into_iter().enumerate() {
1039                curr_sample.contigs[j].segments = contig_segs;
1040            }
1041        }
1042
1043        Ok(())
1044    }
1045
1046    /// Store sample names batch (with ZSTD compression)
1047    pub fn store_batch_sample_names(&mut self, archive: &mut Archive) -> Result<()> {
1048        let v_tmp = self.serialize_sample_names();
1049
1050        let v_data = zstd::encode_all(&v_tmp[..], 19).context("Failed to compress sample names")?;
1051
1052        let stream_id = self
1053            .collection_samples_id
1054            .context("collection-samples stream not registered")?;
1055
1056        archive.add_part_buffered(stream_id, v_data, v_tmp.len() as u64);
1057
1058        Ok(())
1059    }
1060
1061    /// Load sample names batch (with ZSTD decompression)
1062    pub fn load_batch_sample_names(&mut self, archive: &mut Archive) -> Result<()> {
1063        let stream_id = self
1064            .collection_samples_id
1065            .context("collection-samples stream not found")?;
1066
1067        let (v_tmp, raw_size) = archive
1068            .get_part(stream_id)?
1069            .context("No sample names batch found")?;
1070
1071        let v_data = zstd::decode_all(&v_tmp[..]).context("Failed to decompress sample names")?;
1072
1073        if v_data.len() != raw_size as usize {
1074            anyhow::bail!(
1075                "Decompressed size mismatch: expected {}, got {}",
1076                raw_size,
1077                v_data.len()
1078            );
1079        }
1080
1081        self.deserialize_sample_names(&v_data)?;
1082
1083        Ok(())
1084    }
1085
1086    /// Load a batch of contigs (names + details)
1087    #[allow(clippy::needless_range_loop)]
1088    pub fn load_contig_batch(&mut self, archive: &mut Archive, id_batch: usize) -> Result<()> {
1089        // Use cumulative samples_loaded counter, NOT id_batch * batch_size
1090        // C++ AGC creates batches of ~50 samples, but batch_size defaults to 1M which is wrong
1091        let i_sample = self.samples_loaded;
1092
1093        // Load contig names
1094        let contig_stream_id = self
1095            .collection_contigs_id
1096            .context("collection-contigs stream not found")?;
1097
1098        let (v_tmp_names, raw_size_names) = archive.get_part_by_id(contig_stream_id, id_batch)?;
1099        let v_data_names =
1100            zstd::decode_all(&v_tmp_names[..]).context("Failed to decompress contig names")?;
1101
1102        if v_data_names.len() != raw_size_names as usize {
1103            anyhow::bail!(
1104                "Decompressed size mismatch for contig names: expected {}, got {}",
1105                raw_size_names,
1106                v_data_names.len()
1107            );
1108        }
1109
1110        self.deserialize_contig_names(&v_data_names, i_sample)?;
1111
1112        // Load contig details
1113        let details_stream_id = self
1114            .collection_details_id
1115            .context("collection-details stream not found")?;
1116
1117        let (v_stream, _) = archive.get_part_by_id(details_stream_id, id_batch)?;
1118
1119        // Unpack the 5 streams
1120        let mut ptr = v_stream.as_slice();
1121        let mut sizes: [(u32, u32); 5] = Default::default();
1122
1123        for i in 0..5 {
1124            sizes[i].0 = CollectionVarInt::decode(&mut ptr)?; // raw size
1125            sizes[i].1 = CollectionVarInt::decode(&mut ptr)?; // compressed size
1126        }
1127
1128        let mut v_data_details_compressed: [Vec<u8>; 5] = Default::default();
1129        for i in 0..5 {
1130            v_data_details_compressed[i] = ptr[..sizes[i].1 as usize].to_vec();
1131            ptr = &ptr[sizes[i].1 as usize..];
1132        }
1133
1134        // Decompress each stream
1135        let mut v_data_details: [Vec<u8>; 5] = Default::default();
1136        for i in 0..5 {
1137            v_data_details[i] = zstd::decode_all(&v_data_details_compressed[i][..])
1138                .context(format!("Failed to decompress contig details stream {i}"))?;
1139
1140            if v_data_details[i].len() != sizes[i].0 as usize {
1141                anyhow::bail!(
1142                    "Decompressed size mismatch for details stream {}: expected {}, got {}",
1143                    i,
1144                    sizes[i].0,
1145                    v_data_details[i].len()
1146                );
1147            }
1148        }
1149
1150        self.deserialize_contig_details(&v_data_details, i_sample)?;
1151
1152        // Update cumulative counter for next batch
1153        self.samples_loaded += self.no_samples_in_last_batch;
1154
1155        Ok(())
1156    }
1157
1158    /// Store a batch of contigs (names + details)
1159    #[allow(clippy::needless_range_loop)]
1160    pub fn store_contig_batch(
1161        &mut self,
1162        archive: &mut Archive,
1163        id_from: usize,
1164        id_to: usize,
1165    ) -> Result<()> {
1166        // Store contig names
1167        let v_tmp_names = self.serialize_contig_names(id_from, id_to);
1168        let v_data_names =
1169            zstd::encode_all(&v_tmp_names[..], 18).context("Failed to compress contig names")?;
1170
1171        let contig_stream_id = self
1172            .collection_contigs_id
1173            .context("collection-contigs stream not registered")?;
1174
1175        archive.add_part_buffered(contig_stream_id, v_data_names, v_tmp_names.len() as u64);
1176
1177        // Store contig details (5 streams)
1178        let v_data_details_raw = self.serialize_contig_details(id_from, id_to);
1179
1180        let mut v_data_details_compressed: [Vec<u8>; 5] = Default::default();
1181        for i in 0..5 {
1182            v_data_details_compressed[i] = zstd::encode_all(&v_data_details_raw[i][..], 19)
1183                .context(format!("Failed to compress contig details stream {i}"))?;
1184        }
1185
1186        // Pack into single stream
1187        let mut v_stream = Vec::new();
1188        for i in 0..5 {
1189            CollectionVarInt::encode(&mut v_stream, v_data_details_raw[i].len() as u32);
1190            CollectionVarInt::encode(&mut v_stream, v_data_details_compressed[i].len() as u32);
1191        }
1192        for i in 0..5 {
1193            v_stream.extend_from_slice(&v_data_details_compressed[i]);
1194        }
1195
1196        let details_stream_id = self
1197            .collection_details_id
1198            .context("collection-details stream not registered")?;
1199
1200        archive.add_part_buffered(details_stream_id, v_stream, 0);
1201
1202        // Clear contigs from memory (for large datasets)
1203        for sample in &mut self.sample_desc[id_from..id_to] {
1204            sample.contigs.clear();
1205            sample.contigs.shrink_to_fit();
1206        }
1207
1208        Ok(())
1209    }
1210}
1211
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes all `values` into one buffer, then checks that they decode back
    /// in the same order.
    fn assert_varint_roundtrip(values: &[u32]) {
        let mut buf = Vec::new();
        for &value in values {
            CollectionVarInt::encode(&mut buf, value);
        }

        let mut cursor = buf.as_slice();
        for &value in values {
            assert_eq!(CollectionVarInt::decode(&mut cursor).unwrap(), value);
        }
    }

    #[test]
    fn test_collection_varint_1byte() {
        assert_varint_roundtrip(&[0, 42, 127]);
    }

    #[test]
    fn test_collection_varint_2byte() {
        assert_varint_roundtrip(&[128, 255, 16511]);
    }

    #[test]
    fn test_collection_varint_3byte() {
        assert_varint_roundtrip(&[16512, 65536, 2113663]);
    }

    #[test]
    fn test_collection_varint_string() {
        let words = ["Hello", "World"];

        let mut buf = Vec::new();
        for word in words {
            CollectionVarInt::encode_string(&mut buf, word);
        }

        let mut cursor = buf.as_slice();
        for word in words {
            assert_eq!(CollectionVarInt::decode_string(&mut cursor).unwrap(), word);
        }
    }

    #[test]
    fn test_zigzag_i64() {
        // (signed value, zigzag code)
        let pairs = [(0, 0), (1, 2), (-1, 1), (2, 4), (-2, 3)];

        for (signed, code) in pairs {
            assert_eq!(zigzag_encode_i64(signed), code);
        }
        for (signed, code) in pairs {
            assert_eq!(zigzag_decode_i64(code), signed);
        }
    }

    #[test]
    fn test_zigzag_predictive() {
        // Predictive zigzag: (value, prediction, code)
        let cases = [
            (100, 100, 0),   // Same as previous
            (101, 100, 2),   // Slightly above
            (99, 100, 1),    // Slightly below
            (200, 100, 200), // Far away
        ];

        for (value, prev, code) in cases {
            assert_eq!(zigzag_encode(value, prev), code);
        }
        for (value, prev, code) in cases {
            assert_eq!(zigzag_decode(code, prev), value);
        }
    }

    #[test]
    fn test_segment_desc() {
        let SegmentDesc {
            group_id,
            in_group_id,
            is_rev_comp,
            raw_length,
        } = SegmentDesc::new(42, 7, true, 1000);

        assert_eq!(group_id, 42);
        assert_eq!(in_group_id, 7);
        assert!(is_rev_comp);
        assert_eq!(raw_length, 1000);
    }

    #[test]
    fn test_collection_register() {
        let mut coll = CollectionV3::new();

        let registrations = [
            ("sample1", "contig1"),
            ("sample1", "contig2"),
            ("sample2", "contig1"),
        ];
        for (sample, contig) in registrations {
            assert!(coll.register_sample_contig(sample, contig).unwrap());
        }

        assert_eq!(coll.get_no_samples(), 2);
        assert_eq!(coll.get_samples_list(false), vec!["sample1", "sample2"]);
    }

    /// Round-trips in_group_id values through serialize/deserialize.
    ///
    /// All segments live in one group with consecutive in_group_ids spread over
    /// several contigs and samples, so every id after the first is encoded
    /// relative to the id tracked for that group. Both directions update the
    /// tracked id right after each segment; the decoded ids must equal the
    /// originals.
    #[test]
    fn test_in_group_id_delta_encoding_roundtrip() {
        // Builds a configured collection with the sample/contig layout shared by
        // the writer and the reader.
        let new_collection = || {
            let mut coll = CollectionV3::new();
            coll.set_config(60000, 31, None);

            coll.register_sample_contig("sample1", "contig1").unwrap();
            coll.register_sample_contig("sample1", "contig2").unwrap();
            coll.register_sample_contig("sample2", "contig1").unwrap();

            coll
        };

        let mut coll = new_collection();

        // (sample, contig, place, group_id, in_group_id, is_rev_comp, raw_len)
        // Everything is in group 93; in_group_id simply counts up.
        let test_segments = vec![
            ("sample1", "contig1", 0, 93, 0, false, 61000),
            ("sample1", "contig1", 1, 93, 1, false, 61000),
            ("sample1", "contig2", 0, 93, 2, false, 61000),
            ("sample1", "contig2", 1, 93, 3, false, 61000),
            ("sample2", "contig1", 0, 93, 4, false, 61000),
            ("sample2", "contig1", 1, 93, 5, false, 61000),
            ("sample2", "contig1", 2, 93, 6, false, 61000),
        ];

        for (sample, contig, place, group_id, in_group_id, is_rev_comp, raw_len) in test_segments {
            coll.add_segment_placed(
                sample,
                contig,
                place,
                group_id,
                in_group_id,
                is_rev_comp,
                raw_len,
            )
            .unwrap();
        }

        // Serialize from the populated collection, deserialize into a fresh one
        let serialized = coll.serialize_contig_details(0, 2);

        let mut coll2 = new_collection();
        coll2.deserialize_contig_details(&serialized, 0).unwrap();

        // Expected in_group_ids per contig, in segment order
        let expected = [
            ("sample1", "contig1", vec![0u32, 1]),
            ("sample1", "contig2", vec![2, 3]),
            ("sample2", "contig1", vec![4, 5, 6]),
        ];

        for (sample, contig, ids) in expected {
            let segments = coll2.get_contig_desc(sample, contig).unwrap();
            let want_len = ids.len();

            assert_eq!(
                segments.len(),
                want_len,
                "{sample}/{contig} should have {want_len} segments"
            );

            for (idx, &id) in ids.iter().enumerate() {
                assert_eq!(
                    segments[idx].in_group_id, id,
                    "{sample}/{contig} segment {idx} should have in_group_id={id}"
                );
            }
        }
    }
}