ragc_core/
decompressor.rs

1// AGC Decompressor
2// Extracts genomes from AGC archives
3
4use crate::{
5    genome_io::{GenomeWriter, CNV_NUM},
6    lz_diff::LZDiff,
7    segment_compression::decompress_segment_with_marker,
8};
9
10use anyhow::{anyhow, Context, Result};
11use ragc_common::{
12    stream_delta_name, stream_ref_name, Archive, CollectionV3, Contig, SegmentDesc, AGC_FILE_MAJOR,
13    AGC_FILE_MINOR, CONTIG_SEPARATOR,
14};
15use std::collections::HashMap;
16use std::fs::File;
17use std::path::Path;
18
19/// Configuration for the decompressor
20#[derive(Debug, Clone)]
21pub struct DecompressorConfig {
22    pub verbosity: u32,
23}
24
25impl Default for DecompressorConfig {
26    fn default() -> Self {
27        DecompressorConfig { verbosity: 1 }
28    }
29}
30
31/// AGC Decompressor
32///
33/// Provides thread-safe read access to AGC archives.
34/// For concurrent access, use `clone_for_thread()` to create independent readers.
35///
36/// # Thread Safety
37/// - `Decompressor` is NOT `Sync` - cannot be shared between threads
38/// - Use `clone_for_thread()` to create independent readers (cheap - shares archive data)
39/// - Each clone can be used from a separate thread independently
40///
41/// # Example
42/// ```no_run
43/// use ragc_core::{Decompressor, DecompressorConfig};
44/// use std::thread;
45///
46/// let dec = Decompressor::open("data.agc", DecompressorConfig::default())?;
47/// let samples = dec.list_samples();
48///
49/// let handles: Vec<_> = samples.into_iter().map(|sample_name| {
50///     let mut thread_dec = dec.clone_for_thread().unwrap();
51///     thread::spawn(move || {
52///         thread_dec.get_sample(&sample_name)
53///     })
54/// }).collect();
55/// # Ok::<(), anyhow::Error>(())
56/// ```
57pub struct Decompressor {
58    config: DecompressorConfig,
59    archive: Archive,
60    collection: CollectionV3,
61
62    // Cached segment data (group_id -> reference segment)
63    segment_cache: HashMap<u32, Contig>,
64
65    // Archive parameters
66    _segment_size: u32,
67    pub kmer_length: u32,
68    pub min_match_len: u32,
69
70    // Archive path for cloning
71    archive_path: String,
72}
73
74impl Decompressor {
75    /// Open an existing archive for decompression
76    pub fn open(archive_path: &str, config: DecompressorConfig) -> Result<Self> {
77        let mut archive = Archive::new_reader();
78        archive
79            .open(archive_path)
80            .context("Failed to open archive for reading")?;
81
82        let mut collection = CollectionV3::new();
83
84        // Load segment_size, kmer_length, and min_match_len from params stream
85        let (segment_size, kmer_length, min_match_len) = Self::load_params(&mut archive)?;
86
87        if config.verbosity > 1 {
88            eprintln!("Loaded params: segment_size={segment_size}, kmer_length={kmer_length}");
89        }
90
91        // Configure collection
92        collection.set_config(segment_size, kmer_length, None);
93
94        // Load collection metadata
95        collection.prepare_for_decompression(&archive)?;
96        collection.load_batch_sample_names(&mut archive)?;
97
98        if config.verbosity > 0 {
99            eprintln!(
100                "Loaded archive with {} samples",
101                collection.get_no_samples()
102            );
103            let samples = collection.get_samples_list(false);
104            eprintln!("Sample names: {samples:?}");
105        }
106
107        Ok(Decompressor {
108            config,
109            archive,
110            collection,
111            segment_cache: HashMap::new(),
112            _segment_size: segment_size,
113            kmer_length,
114            min_match_len,
115            archive_path: archive_path.to_string(),
116        })
117    }
118
119    /// Load archive parameters from the params stream
120    ///
121    /// Supports multiple params formats for C++ AGC compatibility:
122    /// - 12 bytes: kmer_length, min_match_len, pack_cardinality (older C++ AGC)
123    /// - 16 bytes: kmer_length, min_match_len, pack_cardinality, segment_size (newer C++ AGC)
124    /// - 20 bytes: kmer_length, min_match_len, pack_cardinality, segment_size, no_raw_groups (ragc)
125    fn load_params(archive: &mut Archive) -> Result<(u32, u32, u32)> {
126        // Get params stream
127        let stream_id = archive
128            .get_stream_id("params")
129            .ok_or_else(|| anyhow!("params stream not found in archive"))?;
130
131        // Check that there is exactly one part
132        let num_parts = archive.get_num_parts(stream_id);
133        if num_parts != 1 {
134            anyhow::bail!("Expected 1 part in params stream, found {num_parts}");
135        }
136
137        // Read the params part
138        let (data, _metadata) = archive.get_part_by_id(stream_id, 0)?;
139
140        // Parse based on length
141        if data.len() < 12 {
142            anyhow::bail!(
143                "params stream too short: {} bytes (expected at least 12)",
144                data.len()
145            );
146        }
147
148        let kmer_length = u32::from_le_bytes([data[0], data[1], data[2], data[3]]);
149        let min_match_len = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
150        let _pack_cardinality = u32::from_le_bytes([data[8], data[9], data[10], data[11]]);
151
152        // segment_size is optional (not present in older C++ AGC archives)
153        let segment_size = if data.len() >= 16 {
154            u32::from_le_bytes([data[12], data[13], data[14], data[15]])
155        } else {
156            // Default segment_size for older archives (matches C++ AGC default)
157            60000
158        };
159
160        // no_raw_groups is optional (only in ragc archives) - ignored if present
161
162        Ok((segment_size, kmer_length, min_match_len))
163    }
164
165    /// Get list of samples in the archive
166    pub fn list_samples(&self) -> Vec<String> {
167        self.collection.get_samples_list(false)
168    }
169
170    /// Get compression statistics for all streams
171    /// Returns: Vec<(stream_name, raw_size, packed_size, num_parts)>
172    pub fn get_compression_stats(&self) -> Vec<(String, u64, u64, usize)> {
173        let mut stats = Vec::new();
174        for stream_id in 0..self.archive.get_num_streams() {
175            if let Some(name) = self.archive.get_stream_name(stream_id) {
176                let raw_size = self.archive.get_raw_size(stream_id);
177                let packed_size = self.archive.get_packed_size(stream_id);
178                let num_parts = self.archive.get_num_parts(stream_id);
179                stats.push((name.to_string(), raw_size, packed_size, num_parts));
180            }
181        }
182        stats
183    }
184
185    /// Get list of contigs for a specific sample
186    pub fn list_contigs(&mut self, sample_name: &str) -> Result<Vec<String>> {
187        if self.config.verbosity > 1 {
188            eprintln!("list_contigs called for: {sample_name}");
189            eprintln!(
190                "get_no_contigs before load: {:?}",
191                self.collection.get_no_contigs(sample_name)
192            );
193        }
194
195        // Load ALL contig batches if sample not found (samples may be in any batch)
196        if self
197            .collection
198            .get_no_contigs(sample_name)
199            .is_none_or(|count| count == 0)
200        {
201            if self.config.verbosity > 1 {
202                eprintln!("Loading all contig batches for sample: {sample_name}");
203            }
204            let num_batches = self.collection.get_no_contig_batches(&self.archive)?;
205            for batch_id in 0..num_batches {
206                self.collection
207                    .load_contig_batch(&mut self.archive, batch_id)?;
208            }
209
210            if self.config.verbosity > 1 {
211                eprintln!(
212                    "After load: get_no_contigs = {:?}",
213                    self.collection.get_no_contigs(sample_name)
214                );
215            }
216        } else if self.config.verbosity > 1 {
217            eprintln!(
218                "Contig batch already loaded, get_no_contigs = {:?}",
219                self.collection.get_no_contigs(sample_name)
220            );
221        }
222
223        let result = self.collection.get_contig_list(sample_name);
224        if self.config.verbosity > 1 {
225            eprintln!("get_contig_list result: {result:?}");
226        }
227
228        result.ok_or_else(|| anyhow!("Sample not found: {sample_name}"))
229    }
230
231    /// Extract a specific contig from a sample
232    pub fn get_contig(&mut self, sample_name: &str, contig_name: &str) -> Result<Contig> {
233        // Load ALL contig batches if sample not found (samples may be in any batch)
234        if self
235            .collection
236            .get_no_contigs(sample_name)
237            .is_none_or(|count| count == 0)
238        {
239            let num_batches = self.collection.get_no_contig_batches(&self.archive)?;
240            for batch_id in 0..num_batches {
241                self.collection
242                    .load_contig_batch(&mut self.archive, batch_id)?;
243            }
244        }
245
246        // Get contig segment descriptors
247        let segments = self
248            .collection
249            .get_contig_desc(sample_name, contig_name)
250            .ok_or_else(|| anyhow!("Contig not found: {sample_name}/{contig_name}"))?;
251
252        if self.config.verbosity > 1 {
253            eprintln!(
254                "Extracting {}/{} ({} segments)",
255                sample_name,
256                contig_name,
257                segments.len()
258            );
259            for (i, seg) in segments.iter().enumerate() {
260                eprintln!(
261                    "  Segment[{}]: group_id={}, in_group_id={}, is_rev_comp={}, raw_length={}",
262                    i, seg.group_id, seg.in_group_id, seg.is_rev_comp, seg.raw_length
263                );
264            }
265        }
266
267        // Reconstruct contig from segments
268        self.reconstruct_contig(&segments)
269    }
270
271    /// Public: get segment descriptors for a contig
272    pub fn get_contig_segments_desc(
273        &mut self,
274        sample_name: &str,
275        contig_name: &str,
276    ) -> Result<Vec<ragc_common::SegmentDesc>> {
277        // Ensure batches loaded
278        if self
279            .collection
280            .get_no_contigs(sample_name)
281            .is_none_or(|count| count == 0)
282        {
283            let num_batches = self.collection.get_no_contig_batches(&self.archive)?;
284            for batch_id in 0..num_batches {
285                self.collection
286                    .load_contig_batch(&mut self.archive, batch_id)?;
287            }
288        }
289        self.collection
290            .get_contig_desc(sample_name, contig_name)
291            .ok_or_else(|| anyhow!("Contig not found: {}/{}", sample_name, contig_name))
292    }
293
294    /// Public: get raw segment data for a specific segment descriptor
295    pub fn get_segment_data_by_desc(&mut self, desc: &ragc_common::SegmentDesc) -> Result<Contig> {
296        self.get_segment(desc)
297    }
298
299    /// Public: get the reference segment for a group (loads and caches if needed)
300    pub fn get_reference_segment(&mut self, group_id: u32) -> Result<Contig> {
301        if let Some(ref_data) = self.segment_cache.get(&group_id) {
302            return Ok(ref_data.clone());
303        }
304
305        let archive_version = ragc_common::AGC_FILE_MAJOR * 1000 + ragc_common::AGC_FILE_MINOR;
306        let ref_stream_name = stream_ref_name(archive_version, group_id);
307        let stream_id = self
308            .archive
309            .get_stream_id(&ref_stream_name)
310            .ok_or_else(|| anyhow!("Reference stream not found: {}", ref_stream_name))?;
311
312        let (mut data, metadata) = self.archive.get_part_by_id(stream_id, 0)?;
313        // Decompress if needed; metadata holds original length for packed format
314        let decompressed = if data.is_empty() {
315            Vec::new()
316        } else if data.last() == Some(&0) {
317            // Plain ZSTD stream with marker 0
318            data.pop();
319            decompress_segment_with_marker(&data, 0)?
320        } else {
321            // Tuple-packed with marker 1
322            let marker = data.pop().unwrap();
323            decompress_segment_with_marker(&data, marker)?
324        };
325
326        // Unpack 2-bit encoded reference to 1-byte bases if needed
327        // decompress_segment_with_marker returns bytes in the stored format for references
328        // Our helper already returns decompressed raw bytes for references
329        let reference = decompressed;
330        self.segment_cache.insert(group_id, reference.clone());
331        Ok(reference)
332    }
333
334    /// Extract all contigs from a sample
335    pub fn get_sample(&mut self, sample_name: &str) -> Result<Vec<(String, Contig)>> {
336        // Load ALL contig batches if needed (FIX: was only loading batch 0)
337        if self
338            .collection
339            .get_no_contigs(sample_name)
340            .is_none_or(|count| count == 0)
341        {
342            // Get number of contig batches from the archive
343            let num_batches = self.collection.get_no_contig_batches(&self.archive)?;
344
345            // Load ALL contig batches, not just batch 0
346            for batch_id in 0..num_batches {
347                self.collection
348                    .load_contig_batch(&mut self.archive, batch_id)?;
349            }
350        }
351
352        let sample_desc = self
353            .collection
354            .get_sample_desc(sample_name)
355            .ok_or_else(|| anyhow!("Sample not found: {sample_name}"))?;
356
357        let mut contigs = Vec::new();
358
359        for (contig_name, segments) in sample_desc {
360            if self.config.verbosity > 1 {
361                eprintln!(
362                    "Extracting {}/{} ({} segments)",
363                    sample_name,
364                    contig_name,
365                    segments.len()
366                );
367            }
368
369            let contig_data = self.reconstruct_contig(&segments)?;
370            contigs.push((contig_name, contig_data));
371        }
372
373        Ok(contigs)
374    }
375
376    /// Unpack 2-bit encoded data (C++ AGC format) to 1-byte-per-base format
377    /// C++ AGC stores 4 bases per byte: bits [7:6][5:4][3:2][1:0] = bases [0][1][2][3]
378    /// Each 2-bit value: 00=A=0, 01=C=1, 10=G=2, 11=T=3
379    fn unpack_2bit(packed: &[u8], expected_length: usize) -> Contig {
380        let mut unpacked = Vec::with_capacity(expected_length);
381
382        for &byte in packed {
383            // Extract 4 bases from each byte (most significant bits first)
384            unpacked.push((byte >> 6) & 0x03);
385            unpacked.push((byte >> 4) & 0x03);
386            unpacked.push((byte >> 2) & 0x03);
387            unpacked.push(byte & 0x03);
388        }
389
390        // Truncate to expected length (last byte might not use all 4 slots)
391        unpacked.truncate(expected_length);
392        unpacked
393    }
394
395    /// Apply reverse complement to a segment (reverse order and complement each base)
396    ///
397    /// C++ AGC compatibility: Only complement bases 0-3 (A, C, G, T).
398    /// IUPAC ambiguity codes (4+) are reversed but NOT complemented.
399    /// This matches C++ AGC's behavior: `(*p < 4) ? 3 - *p : *p`
400    fn reverse_complement_segment(segment: &[u8]) -> Contig {
401        segment
402            .iter()
403            .rev()
404            .map(|&base| {
405                if base < 4 {
406                    3 - base // Complement: A(0)<->T(3), C(1)<->G(2)
407                } else {
408                    base // Keep IUPAC codes unchanged (just reverse position)
409                }
410            })
411            .collect()
412    }
413
414    /// Reconstruct a contig from its segment descriptors
415    fn reconstruct_contig(&mut self, segments: &[SegmentDesc]) -> Result<Contig> {
416        let mut contig = Contig::new();
417        #[cfg(feature = "verbose_debug")]
418        let should_debug = crate::env_cache::debug_reconstruct() && segments.len() > 1;
419        #[cfg(not(feature = "verbose_debug"))]
420        let should_debug = false;
421
422        if should_debug {
423            eprintln!(
424                "DEBUG_RECONSTRUCT: {} segments, k={}",
425                segments.len(),
426                self.kmer_length
427            );
428        }
429
430        for (i, segment_desc) in segments.iter().enumerate() {
431            let mut segment_data = self.get_segment(segment_desc)?;
432
433            // Apply reverse complement if the segment was stored in reverse orientation
434            if segment_desc.is_rev_comp {
435                segment_data = Self::reverse_complement_segment(&segment_data);
436            }
437
438            if i == 0 {
439                // First segment: add completely
440                if should_debug {
441                    let last_k: Vec<u8> = segment_data
442                        .iter()
443                        .rev()
444                        .take(self.kmer_length as usize)
445                        .rev()
446                        .copied()
447                        .collect();
448                    eprintln!(
449                        "  Seg {}: len={} (added fully), last_{}_bytes={:?}, contig_len={}",
450                        i,
451                        segment_data.len(),
452                        self.kmer_length,
453                        last_k,
454                        contig.len() + segment_data.len()
455                    );
456                }
457                contig.extend_from_slice(&segment_data);
458            } else {
459                // Subsequent segments: skip first k bases (full k-mer overlap)
460                // Each segment includes the FULL k-mer at the start
461                let overlap = self.kmer_length as usize;
462                if segment_data.len() < overlap {
463                    eprintln!(
464                        "ERROR: Segment {} too short! Length={}, overlap={}",
465                        i,
466                        segment_data.len(),
467                        overlap
468                    );
469                    eprintln!(
470                        "  Segment desc: group_id={}, in_group_id={}, raw_length={}",
471                        segment_desc.group_id, segment_desc.in_group_id, segment_desc.raw_length
472                    );
473                    anyhow::bail!("Corrupted archive: segment too short (segment {}, got {} bytes, need at least {} bytes)", i, segment_data.len(), overlap);
474                }
475                let added = segment_data.len() - overlap;
476                let first_k: Vec<u8> = segment_data.iter().take(overlap).copied().collect();
477                if should_debug {
478                    eprintln!(
479                        "  Seg {}: len={} (skip {}), first_{}_bytes={:?}, added={}, contig_len={}",
480                        i,
481                        segment_data.len(),
482                        overlap,
483                        overlap,
484                        first_k,
485                        added,
486                        contig.len() + added
487                    );
488                }
489                contig.extend_from_slice(&segment_data[overlap..]);
490            }
491        }
492
493        if should_debug {
494            eprintln!("  Final contig length: {}", contig.len());
495        }
496
497        Ok(contig)
498    }
499
500    /// Unpack contigs from packed data by splitting on CONTIG_SEPARATOR (0xFF)
501    fn unpack_contig(packed_data: &[u8], position_in_pack: usize) -> Result<Contig> {
502        let mut contig_start = 0;
503        let mut current_position = 0;
504
505        for (i, &byte) in packed_data.iter().enumerate() {
506            if byte == CONTIG_SEPARATOR {
507                if current_position == position_in_pack {
508                    // Found the contig we're looking for
509                    return Ok(packed_data[contig_start..i].to_vec());
510                }
511                current_position += 1;
512                contig_start = i + 1;
513            }
514        }
515
516        // Handle last contig (if no trailing separator or position is last)
517        if current_position == position_in_pack {
518            return Ok(packed_data[contig_start..].to_vec());
519        }
520
521        anyhow::bail!(
522            "Position {} not found in packed data (only {} contigs found)",
523            position_in_pack,
524            current_position + 1
525        )
526    }
527
528    /// Get a single segment (handles reference and LZ diff decoding)
529    /// Supports packed-contig mode where multiple contigs are stored in one part
530    ///
531    /// Two-stream architecture:
532    /// - Raw groups (0-15): All segments in delta stream
533    /// - LZ groups (16+): Reference in ref stream (part 0), LZ-encoded segments in delta stream
534    fn get_segment(&mut self, desc: &SegmentDesc) -> Result<Contig> {
535        const PACK_CARDINALITY: usize = 50; // C++ default
536        const NO_RAW_GROUPS: u32 = 16;
537
538        let archive_version = AGC_FILE_MAJOR * 1000 + AGC_FILE_MINOR;
539
540        if self.config.verbosity > 1 {
541            eprintln!(
542                "get_segment: group_id={}, in_group_id={}, raw_length={}",
543                desc.group_id, desc.in_group_id, desc.raw_length
544            );
545        }
546
547        // Handle LZ groups (>= 16) differently from raw groups (< 16)
548        if desc.group_id >= NO_RAW_GROUPS {
549            // LZ group: two-stream architecture
550
551            // First, ensure we have the reference loaded
552            if !self.segment_cache.contains_key(&desc.group_id) {
553                // Load reference from ref stream (part 0)
554                let ref_stream_name = stream_ref_name(archive_version, desc.group_id);
555                let ref_stream_id = self
556                    .archive
557                    .get_stream_id(&ref_stream_name)
558                    .ok_or_else(|| anyhow!("Reference stream not found: {ref_stream_name}"))?;
559
560                let (mut ref_data, ref_metadata) = self.archive.get_part_by_id(ref_stream_id, 0)?;
561
562                // Decompress if needed
563                let mut decompressed_ref = if ref_metadata == 0 {
564                    ref_data
565                } else {
566                    if ref_data.is_empty() {
567                        anyhow::bail!("Empty compressed reference data");
568                    }
569                    let marker = ref_data.pop().unwrap();
570                    decompress_segment_with_marker(&ref_data, marker)?
571                };
572
573                // Check if data is in 2-bit packed format (C++ AGC compatibility)
574                // If decompressed length is ~1/4 of raw_length, it's packed
575                // Allow small tolerance for rounding (last byte can encode 1-4 bases)
576                //
577                // CRITICAL: Use ref_metadata (original reference size) NOT desc.raw_length (requested segment size)
578                // When ref_metadata != 0, it contains the original uncompressed size of the REFERENCE.
579                // desc.raw_length is the size of the segment being requested, which may differ from the reference.
580                let expected_ref_len = if ref_metadata != 0 {
581                    ref_metadata as usize
582                } else {
583                    // If ref_metadata == 0, the reference was stored uncompressed, so use actual size
584                    decompressed_ref.len()
585                };
586
587                let is_packed = decompressed_ref.len() * 4 >= expected_ref_len
588                    && decompressed_ref.len() * 4 < expected_ref_len + 8;
589
590                if self.config.verbosity > 2 {
591                    eprintln!(
592                        "  DEBUG: decompressed_len={}, expected_ref_len={}, decompressed*4={}, is_packed={}",
593                        decompressed_ref.len(),
594                        expected_ref_len,
595                        decompressed_ref.len() * 4,
596                        is_packed
597                    );
598                }
599
600                if is_packed {
601                    // Unpack from 2-bit format to 1-byte-per-base format
602                    decompressed_ref = Self::unpack_2bit(&decompressed_ref, expected_ref_len);
603
604                    if self.config.verbosity > 1 {
605                        eprintln!(
606                            "Loaded & unpacked reference for group {}: length={} (was {} bytes packed)",
607                            desc.group_id,
608                            decompressed_ref.len(),
609                            decompressed_ref.len() / 4
610                        );
611                    }
612                } else if self.config.verbosity > 1 {
613                    eprintln!(
614                        "Loaded reference for group {}: length={} (already unpacked)",
615                        desc.group_id,
616                        decompressed_ref.len()
617                    );
618                }
619
620                // Cache the reference
621                self.segment_cache.insert(desc.group_id, decompressed_ref);
622            }
623
624            // If this IS the reference (in_group_id == 0), return it directly
625            if desc.in_group_id == 0 {
626                return Ok(self.segment_cache.get(&desc.group_id).unwrap().clone());
627            }
628
629            // Otherwise, load LZ-encoded segment from delta stream
630            // For delta stream: in_group_id 1 is at position 0, in_group_id 2 is at position 1, etc.
631            let delta_position = (desc.in_group_id - 1) as usize;
632            let pack_id = delta_position / PACK_CARDINALITY;
633            let position_in_pack = delta_position % PACK_CARDINALITY;
634
635            if self.config.verbosity > 1 {
636                eprintln!(
637                    "  LZ group: delta_position={delta_position}, pack_id={pack_id}, position_in_pack={position_in_pack}"
638                );
639            }
640
641            let delta_stream_name = stream_delta_name(archive_version, desc.group_id);
642            let delta_stream_id =
643                self.archive
644                    .get_stream_id(&delta_stream_name)
645                    .ok_or_else(|| {
646                        eprintln!("ERROR: Delta stream not found: {}", delta_stream_name);
647                        eprintln!(
648                            "  Requested by segment: group_id={}, in_group_id={}, raw_length={}",
649                            desc.group_id, desc.in_group_id, desc.raw_length
650                        );
651                        anyhow!("Delta stream not found: {delta_stream_name}")
652                    })?;
653
654            // Check how many parts this stream has
655            let num_parts = self.archive.get_num_parts(delta_stream_id);
656
657            if pack_id >= num_parts {
658                anyhow::bail!("Pack ID {} out of range (stream has only {} parts). in_group_id={}, delta_position={}",
659                    pack_id, num_parts, desc.in_group_id, delta_position);
660            }
661
662            let (mut delta_data, delta_metadata) =
663                self.archive.get_part_by_id(delta_stream_id, pack_id)?;
664
665            // Decompress if needed
666            let decompressed_pack = if delta_metadata == 0 {
667                delta_data
668            } else {
669                if delta_data.is_empty() {
670                    anyhow::bail!("Empty compressed delta data");
671                }
672                let marker = delta_data.pop().unwrap();
673                decompress_segment_with_marker(&delta_data, marker)?
674            };
675
676            // Unpack the specific LZ-encoded segment
677            let lz_encoded = Self::unpack_contig(&decompressed_pack, position_in_pack)?;
678
679            if self.config.verbosity > 1 {
680                eprintln!("Unpacked LZ-encoded segment: length={}", lz_encoded.len());
681            }
682
683            // Decode using reference
684            let reference = self
685                .segment_cache
686                .get(&desc.group_id)
687                .ok_or_else(|| anyhow!("Reference not loaded for group {}", desc.group_id))?;
688
689            let mut lz_diff = LZDiff::new(self.min_match_len);
690            lz_diff.prepare(reference);
691
692            // Handle empty encoding (segment equals reference)
693            let decoded = if lz_encoded.is_empty() {
694                reference.clone()
695            } else {
696                lz_diff.decode(&lz_encoded)
697            };
698
699            // Debug: check for corruption pattern (AAAA = zeros)
700            if crate::env_cache::debug_decode() {
701                // Critical: check if decoded length mismatches raw_length
702                if decoded.len() as u32 != desc.raw_length {
703                    eprintln!("RAGC_DEBUG_DECODE_MISMATCH: group={} in_group={} raw_len={} decoded_len={} ref_len={} lz_encoded_len={}",
704                        desc.group_id, desc.in_group_id, desc.raw_length, decoded.len(), reference.len(), lz_encoded.len());
705                    eprintln!(
706                        "  LZ encoded bytes (first 100): {:?}",
707                        &lz_encoded[..lz_encoded.len().min(100)]
708                    );
709
710                    // Show first few bytes of decoded and expected
711                    let dec_start: Vec<u8> = decoded.iter().take(30).copied().collect();
712                    let ref_start: Vec<u8> = reference.iter().take(30).copied().collect();
713                    eprintln!("  Decoded[0:30]={:?}", dec_start);
714                    eprintln!("  Reference[0:30]={:?}", ref_start);
715                }
716            }
717
718            if self.config.verbosity > 1 {
719                eprintln!("Decoded segment: length={}", decoded.len());
720            }
721
722            Ok(decoded)
723        } else {
724            // Raw group (0-15): NO reference stream, ALL data in delta stream
725            // This matches C++ AGC's segment.cpp get_raw() function:
726            // - part_id = id_seq / contigs_in_pack
727            // - seq_in_part_id = id_seq % contigs_in_pack
728            // Data is stored raw (not LZ-encoded), separated by contig_separator (0x7f)
729
730            let pack_id = desc.in_group_id as usize / PACK_CARDINALITY;
731            let position_in_pack = desc.in_group_id as usize % PACK_CARDINALITY;
732
733            if self.config.verbosity > 1 {
734                eprintln!(
735                    "  Raw group {}: in_group_id={}, pack_id={}, position_in_pack={}",
736                    desc.group_id, desc.in_group_id, pack_id, position_in_pack
737                );
738            }
739
740            let stream_name = stream_delta_name(archive_version, desc.group_id);
741            let stream_id = self
742                .archive
743                .get_stream_id(&stream_name)
744                .ok_or_else(|| anyhow!("Delta stream not found: {stream_name}"))?;
745
746            let (mut data, metadata) = self.archive.get_part_by_id(stream_id, pack_id)?;
747
748            // Decompress if needed
749            let decompressed_pack = if metadata == 0 {
750                data
751            } else {
752                if data.is_empty() {
753                    anyhow::bail!("Empty compressed data for raw group");
754                }
755                let marker = data.pop().unwrap();
756                decompress_segment_with_marker(&data, marker)?
757            };
758
759            // Unpack the raw segment (segments separated by 0x7f)
760            let contig_data = Self::unpack_contig(&decompressed_pack, position_in_pack)?;
761
762            if self.config.verbosity > 1 {
763                eprintln!("Unpacked raw segment: length={}", contig_data.len());
764            }
765
766            Ok(contig_data)
767        }
768    }
769
770    /// Get all sample names that match a given prefix
771    ///
772    /// This is useful for extracting all samples from a specific genome or haplotype.
773    ///
774    /// # Example
775    /// ```no_run
776    /// use ragc_core::{Decompressor, DecompressorConfig};
777    ///
778    /// let dec = Decompressor::open("data.agc", DecompressorConfig::default())?;
779    ///
780    /// // Get all samples starting with "AAA"
781    /// let aaa_samples = dec.list_samples_with_prefix("AAA");
782    /// // Returns: ["AAA#0", "AAA#1", ...] if they exist
783    ///
784    /// // Get all samples with haplotype 0
785    /// let hap0_samples = dec.list_samples_with_prefix("AAA#0");
786    /// # Ok::<(), anyhow::Error>(())
787    /// ```
788    pub fn list_samples_with_prefix(&self, prefix: &str) -> Vec<String> {
789        self.list_samples()
790            .into_iter()
791            .filter(|s| s.starts_with(prefix))
792            .collect()
793    }
794
795    /// Extract multiple samples matching a prefix
796    ///
797    /// Returns a HashMap mapping sample names to their contigs.
798    /// Each contig is represented as (contig_name, sequence).
799    ///
800    /// # Example
801    /// ```no_run
802    /// use ragc_core::{Decompressor, DecompressorConfig};
803    ///
804    /// let mut dec = Decompressor::open("data.agc", DecompressorConfig::default())?;
805    ///
806    /// // Extract all AAA samples
807    /// let aaa_samples = dec.get_samples_by_prefix("AAA")?;
808    ///
809    /// for (sample_name, contigs) in aaa_samples {
810    ///     println!("Sample {}: {} contigs", sample_name, contigs.len());
811    ///     for (contig_name, sequence) in contigs {
812    ///         println!("  {}: {} bp", contig_name, sequence.len());
813    ///     }
814    /// }
815    /// # Ok::<(), anyhow::Error>(())
816    /// ```
817    pub fn get_samples_by_prefix(
818        &mut self,
819        prefix: &str,
820    ) -> Result<HashMap<String, Vec<(String, Contig)>>> {
821        let sample_names = self.list_samples_with_prefix(prefix);
822        let mut results = HashMap::new();
823
824        for sample_name in sample_names {
825            let contigs = self.get_sample(&sample_name)?;
826            results.insert(sample_name, contigs);
827        }
828
829        Ok(results)
830    }
831
832    /// Clone this decompressor for use in another thread
833    ///
834    /// This creates an independent reader that can be used from a different thread.
835    /// The clone is lightweight as it shares the archive data structure.
836    ///
837    /// # Thread Safety
838    /// Each clone is fully independent and can be used from a separate thread.
839    /// The segment cache is NOT shared between clones.
840    ///
841    /// # Example
842    /// ```no_run
843    /// use ragc_core::{Decompressor, DecompressorConfig};
844    /// use std::thread;
845    ///
846    /// let dec = Decompressor::open("data.agc", DecompressorConfig::default())?;
847    /// let samples = dec.list_samples();
848    ///
849    /// // Spawn threads to extract samples in parallel
850    /// let handles: Vec<_> = samples.into_iter().map(|sample_name| {
851    ///     let mut thread_dec = dec.clone_for_thread().unwrap();
852    ///     thread::spawn(move || {
853    ///         thread_dec.get_sample(&sample_name)
854    ///     })
855    /// }).collect();
856    ///
857    /// // Collect results
858    /// for handle in handles {
859    ///     let result = handle.join().unwrap()?;
860    ///     // Process result...
861    /// }
862    /// # Ok::<(), anyhow::Error>(())
863    /// ```
864    pub fn clone_for_thread(&self) -> Result<Self> {
865        // Re-open the archive (cheap - just re-reads metadata)
866        // Archive data is memory-mapped or cached by the OS
867        Self::open(&self.archive_path, self.config.clone())
868    }
869
870    /// Write a sample to a FASTA file
871    pub fn write_sample_fasta(&mut self, sample_name: &str, output_path: &Path) -> Result<()> {
872        let contigs = self.get_sample(sample_name)?;
873
874        let mut writer = GenomeWriter::<File>::create(output_path)?;
875
876        for (contig_name, contig_data) in contigs {
877            // Convert numeric encoding back to ASCII using CNV_NUM lookup table
878            // CNV_NUM[0..16] = [A, C, G, T, N, R, Y, S, W, K, M, B, D, H, V, U]
879            // This preserves all IUPAC ambiguity codes, not just ACGT
880            let mut ascii_data = Vec::with_capacity(contig_data.len());
881            for &base in &contig_data {
882                // Use CNV_NUM[0..16] for proper IUPAC code decoding
883                // Values 0-15 map to valid bases, anything else (shouldn't happen) maps to N
884                let ascii_base = if base < 16 {
885                    CNV_NUM[base as usize]
886                } else {
887                    b'N'
888                };
889                ascii_data.push(ascii_base);
890            }
891
892            writer.save_contig_directly(&contig_name, &ascii_data, 0)?;
893        }
894
895        Ok(())
896    }
897
898    /// Get archive inspection data for debugging/comparison
899    /// Returns: (group_id, num_segments, num_reference_segments, num_delta_segments)
900    pub fn get_group_statistics(&mut self) -> Result<Vec<(u32, usize, usize, usize)>> {
901        use std::collections::HashMap;
902
903        // Load ALL contig batches to get complete segment information
904        let num_batches = self.collection.get_no_contig_batches(&self.archive)?;
905        for batch_id in 0..num_batches {
906            self.collection
907                .load_contig_batch(&mut self.archive, batch_id)?;
908        }
909
910        let sample_names = self.list_samples();
911
912        // Now collect statistics about groups
913        let mut group_stats: HashMap<u32, (usize, usize, usize)> = HashMap::new();
914
915        for sample_name in &sample_names {
916            let contig_list = self
917                .collection
918                .get_contig_list(sample_name)
919                .ok_or_else(|| anyhow!("Failed to get contig list for {}", sample_name))?;
920
921            for contig_name in &contig_list {
922                if let Some(segments) = self.collection.get_contig_desc(sample_name, contig_name) {
923                    for seg in segments {
924                        let entry = group_stats.entry(seg.group_id).or_insert((0, 0, 0));
925                        entry.0 += 1; // total segments
926                                      // Raw groups (0-15) have NO references - all segments are raw/delta
927                                      // LZ groups (16+) have references at in_group_id==0
928                        if seg.group_id >= 16 && seg.in_group_id == 0 {
929                            entry.1 += 1; // reference segments (LZ groups only)
930                        } else {
931                            entry.2 += 1; // delta/raw segments
932                        }
933                    }
934                }
935            }
936        }
937
938        let mut stats: Vec<_> = group_stats
939            .into_iter()
940            .map(|(gid, (total, refs, deltas))| (gid, total, refs, deltas))
941            .collect();
942        stats.sort_by_key(|s| s.0);
943
944        Ok(stats)
945    }
946
947    /// Get detailed segment information for all contigs
948    /// Returns: Vec<(sample_name, contig_name, Vec<SegmentDesc>)>
949    pub fn get_all_segments(&mut self) -> Result<Vec<(String, String, Vec<SegmentDesc>)>> {
950        // Load ALL contig batches
951        let num_batches = self.collection.get_no_contig_batches(&self.archive)?;
952        for batch_id in 0..num_batches {
953            self.collection
954                .load_contig_batch(&mut self.archive, batch_id)?;
955        }
956
957        let sample_names = self.list_samples();
958        let mut all_segments = Vec::new();
959
960        for sample_name in &sample_names {
961            let contig_list = self
962                .collection
963                .get_contig_list(sample_name)
964                .ok_or_else(|| anyhow!("Failed to get contig list for {}", sample_name))?;
965
966            for contig_name in &contig_list {
967                if let Some(segments) = self.collection.get_contig_desc(sample_name, contig_name) {
968                    all_segments.push((
969                        sample_name.clone(),
970                        contig_name.clone(),
971                        segments.to_vec(),
972                    ));
973                }
974            }
975        }
976
977        Ok(all_segments)
978    }
979
980    /// Close the archive
981    pub fn close(mut self) -> Result<()> {
982        self.archive.close()
983    }
984}