ragc_core/
lz_diff.rs

1// LZ Diff Encoding
2// Encodes a target sequence as differences from a reference sequence
3
4#![allow(clippy::same_item_push)]
5
6use ragc_common::{hash::MurMur64Hash, types::Contig};
7use std::sync::atomic::{AtomicU64, Ordering};
8
9// Debug counters for comparing with C++ AGC
10static TOTAL_MATCHES: AtomicU64 = AtomicU64::new(0);
11static TOTAL_LITERALS: AtomicU64 = AtomicU64::new(0);
12static TOTAL_BANG_REPLACEMENTS: AtomicU64 = AtomicU64::new(0);
13static TOTAL_ENCODED_BYTES: AtomicU64 = AtomicU64::new(0);
14static TOTAL_NRUNS: AtomicU64 = AtomicU64::new(0);
15
16/// Print LZ encoding statistics (call at end of compression)
17pub fn print_lz_stats() {
18    eprintln!(
19        "LZ_STATS: matches={} literals={} bang_replacements={} nruns={} encoded_bytes={}",
20        TOTAL_MATCHES.load(Ordering::Relaxed),
21        TOTAL_LITERALS.load(Ordering::Relaxed),
22        TOTAL_BANG_REPLACEMENTS.load(Ordering::Relaxed),
23        TOTAL_NRUNS.load(Ordering::Relaxed),
24        TOTAL_ENCODED_BYTES.load(Ordering::Relaxed)
25    );
26}
27
28/// Constants for LZ diff encoding
29const N_CODE: u8 = 4;
30const N_RUN_STARTER_CODE: u8 = 30;
31const MIN_NRUN_LEN: u32 = 4;
32const MAX_NO_TRIES: usize = 64;
33const HASHING_STEP: usize = 4; // USE_SPARSE_HT mode
34
35/// LZ Diff encoder/decoder (V2 implementation)
36pub struct LZDiff {
37    reference: Vec<u8>,
38    reference_len: usize, // Original length before padding
39    // Linear-probing table for exact coding-cost matching with C++
40    ht_lp: Vec<u32>, // stores (i / HASHING_STEP) or u32::MAX for empty
41    ht_mask: u64,
42    min_match_len: u32,
43    key_len: u32,
44    key_mask: u64,
45}
46
47impl LZDiff {
48    /// Create a new LZ diff encoder with the given minimum match length
49    pub fn new(min_match_len: u32) -> Self {
50        let key_len = min_match_len - (HASHING_STEP as u32) + 1;
51        let key_mask = if key_len >= 32 {
52            !0u64
53        } else {
54            (1u64 << (2 * key_len)) - 1
55        };
56
57        LZDiff {
58            reference: Vec::new(),
59            reference_len: 0,
60            ht_lp: Vec::new(),
61            ht_mask: 0,
62            min_match_len,
63            key_len,
64            key_mask,
65        }
66    }
67
68    /// Prepare the encoder with a reference sequence
69    pub fn prepare(&mut self, reference: &Contig) {
70        let debug_lz = crate::env_cache::debug_lz_enabled();
71
72        self.reference = reference.clone();
73        self.reference_len = reference.len(); // Store original length before padding
74                                              // Add padding for key_len
75        self.reference
76            .resize(self.reference.len() + self.key_len as usize, 31);
77
78        self.build_index_lp();
79
80        if debug_lz {
81            eprintln!(
82                "RAGC_LZ_PREPARE: ref_len={} key_len={} ht_lp_size={} ht_mask={:#x}",
83                self.reference_len,
84                self.key_len,
85                self.ht_lp.len(),
86                self.ht_mask
87            );
88            // Count non-empty slots
89            let filled = self.ht_lp.iter().filter(|&&x| x != u32::MAX).count();
90            eprintln!(
91                "RAGC_LZ_PREPARE: ht_lp filled={}/{} ({:.1}%)",
92                filled,
93                self.ht_lp.len(),
94                100.0 * filled as f64 / self.ht_lp.len() as f64
95            );
96        }
97    }
98
99    /// Build linear-probing hash table (exactly like C++ CLZDiffBase::make_index32)
100    fn build_index_lp(&mut self) {
101        // Count valid k-mer positions as in C++ prepare_index (sparse mode)
102        let mut ht_size: u64 = 0;
103        let mut no_prev_valid: u32 = 0;
104        let mut cnt_mod: u32 = 0;
105        let key_len_mod: u32 = self.key_len % (HASHING_STEP as u32);
106        for &c in &self.reference {
107            if c < 4 {
108                no_prev_valid += 1;
109            } else {
110                no_prev_valid = 0;
111            }
112            cnt_mod += 1;
113            if cnt_mod == HASHING_STEP as u32 {
114                cnt_mod = 0;
115            }
116            if cnt_mod == key_len_mod && no_prev_valid >= self.key_len {
117                ht_size += 1;
118            }
119        }
120
121        // Adjust size by load factor (0.7) and round to power of two then double
122        let mut ht_size = (ht_size as f64 / 0.7) as u64;
123        if ht_size == 0 {
124            ht_size = 1;
125        }
126        while (ht_size & (ht_size - 1)) != 0 {
127            ht_size &= ht_size - 1;
128        }
129        ht_size <<= 1;
130        if ht_size < 8 {
131            ht_size = 8;
132        }
133
134        self.ht_mask = ht_size - 1;
135        self.ht_lp.clear();
136        self.ht_lp.resize(ht_size as usize, u32::MAX);
137
138        // Insert positions with linear probing (sparse step)
139        let ref_len = self.reference.len();
140        let mut i = 0usize;
141        while i + (self.key_len as usize) < ref_len {
142            if let Some(code) = self.get_code(&self.reference[i..]) {
143                let base = (MurMur64Hash::hash(code) & self.ht_mask) as usize;
144                for j in 0..MAX_NO_TRIES {
145                    let idx = (base + j) & (self.ht_mask as usize);
146                    if self.ht_lp[idx] == u32::MAX {
147                        self.ht_lp[idx] = (i / HASHING_STEP) as u32;
148                        break;
149                    }
150                }
151            }
152            i += HASHING_STEP;
153        }
154    }
155
156    /// Extract k-mer code from sequence
157    #[allow(clippy::needless_range_loop)]
158    fn get_code(&self, seq: &[u8]) -> Option<u64> {
159        let mut code = 0u64;
160        for i in 0..(self.key_len as usize) {
161            if seq[i] > 3 {
162                return None; // Invalid base (N or other)
163            }
164            code = (code << 2) | (seq[i] as u64);
165        }
166        Some(code)
167    }
168
169    /// Extract k-mer code using sliding window optimization
170    fn get_code_skip1(&self, prev_code: u64, seq: &[u8]) -> Option<u64> {
171        let last_base_idx = (self.key_len as usize) - 1;
172        if seq[last_base_idx] > 3 {
173            return None;
174        }
175        let code = ((prev_code << 2) & self.key_mask) | (seq[last_base_idx] as u64);
176        Some(code)
177    }
178
179    /// Check for N-run (at least 3 consecutive N bases)
180    fn get_nrun_len(&self, seq: &[u8], max_len: usize) -> u32 {
181        if seq.len() < 3 || seq[0] != N_CODE || seq[1] != N_CODE || seq[2] != N_CODE {
182            return 0;
183        }
184
185        let mut len = 3;
186        while len < max_len && seq[len] == N_CODE {
187            len += 1;
188        }
189        len as u32
190    }
191
192    /// Encode a literal base
193    fn encode_literal(&self, base: u8, encoded: &mut Vec<u8>) {
194        encoded.push(b'A' + base);
195    }
196
197    /// Encode an N-run
198    fn encode_nrun(&self, len: u32, encoded: &mut Vec<u8>) {
199        encoded.push(N_RUN_STARTER_CODE);
200        self.append_int(encoded, (len - MIN_NRUN_LEN) as i64);
201        encoded.push(N_CODE);
202    }
203
204    /// Encode a match
205    /// Format: <pos>,<len>. or <pos>. (comma ONLY when length is present)
206    /// Match-to-end: len=None → "pos." (no comma, matching C++ AGC lz_diff.cpp:637-641)
207    /// Normal match: len=Some → "pos,len." (with comma)
208    fn encode_match(&self, ref_pos: u32, len: Option<u32>, pred_pos: u32, encoded: &mut Vec<u8>) {
209        let dif_pos = (ref_pos as i32) - (pred_pos as i32);
210        self.append_int(encoded, dif_pos as i64);
211
212        // C++ AGC V2 format: comma ONLY if length is present (not match-to-end)
213        // See lz_diff.cpp lines 637-641: if (len != ~0u) { encoded.emplace_back(','); ... }
214        if let Some(match_len) = len {
215            encoded.push(b',');
216            self.append_int(encoded, (match_len - self.min_match_len) as i64);
217        }
218
219        encoded.push(b'.');
220    }
221
222    /// Append integer as ASCII decimal
223    fn append_int(&self, text: &mut Vec<u8>, mut x: i64) {
224        if x == 0 {
225            text.push(b'0');
226            return;
227        }
228
229        if x < 0 {
230            text.push(b'-');
231            x = -x;
232        }
233
234        // Write digits directly to output (in reverse), then reverse just that portion
235        let start_pos = text.len();
236        while x > 0 {
237            text.push(b'0' + (x % 10) as u8);
238            x /= 10;
239        }
240
241        // Reverse just the digits we added
242        text[start_pos..].reverse();
243    }
244
245    /// Find best match using linear-probing table (exactly like C++ for cost vectors)
246    fn find_best_match_lp(
247        &self,
248        kmer_code: u64,
249        hash: u64,
250        target: &[u8],
251        text_pos: usize,
252        max_len: usize,
253        no_prev_literals: usize,
254    ) -> Option<(u32, u32, u32)> {
255        let debug_lz = crate::env_cache::debug_lz_enabled();
256        if self.ht_lp.is_empty() {
257            return None;
258        }
259
260        let mut best_ref_pos = 0u32;
261        let mut best_len_bck = 0u32;
262        let mut best_len_fwd = 0u32;
263        let mut min_to_update = self.min_match_len as usize;
264
265        let ht_pos = (hash & self.ht_mask) as usize;
266        let mut probes = 0usize;
267        let mut found_match = false;
268
269        for j in 0..MAX_NO_TRIES {
270            let idx = (ht_pos + j) & (self.ht_mask as usize);
271            let slot = self.ht_lp[idx];
272            probes += 1;
273            if slot == u32::MAX {
274                break;
275            }
276            found_match = true;
277
278            let h_pos = (slot as usize) * HASHING_STEP;
279            if h_pos >= self.reference.len() {
280                continue;
281            }
282
283            // CRITICAL: Verify the k-mer actually matches (not just hash collision)
284            if let Some(ref_kmer_code) = self.get_code(&self.reference[h_pos..]) {
285                if ref_kmer_code != kmer_code {
286                    // Hash collision - this position has a different k-mer
287                    if debug_lz && text_pos < 10 && probes < 3 {
288                        eprintln!(
289                            "RAGC_LZ_COLLISION: text_pos={} probe={} h_pos={} kmer_mismatch",
290                            text_pos, j, h_pos
291                        );
292                    }
293                    continue;
294                }
295            } else {
296                // Invalid k-mer at this position (contains N)
297                if debug_lz && text_pos < 10 && probes < 3 {
298                    eprintln!(
299                        "RAGC_LZ_INVALID_KMER: text_pos={} probe={} h_pos={} contains_N",
300                        text_pos, j, h_pos
301                    );
302                }
303                continue;
304            }
305
306            let ref_ptr = &self.reference[h_pos..];
307            let text_ptr = &target[text_pos..];
308            let f_len = Self::matching_length(text_ptr, ref_ptr, max_len);
309
310            if debug_lz && text_pos < 5 && j == 0 {
311                // Show the actual k-mer (key_len bytes) being compared
312                let kmer_len = self.key_len as usize;
313                let ref_kmer: String = ref_ptr
314                    .iter()
315                    .take(kmer_len)
316                    .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
317                    .collect();
318                let tgt_kmer: String = text_ptr
319                    .iter()
320                    .take(kmer_len)
321                    .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
322                    .collect();
323                let kmer_match = ref_ptr
324                    .iter()
325                    .zip(text_ptr.iter())
326                    .take(kmer_len)
327                    .all(|(a, b)| a == b);
328
329                eprintln!(
330                    "RAGC_LZ_KMER: text_pos={} h_pos={} kmer_match={} f_len={}",
331                    text_pos, h_pos, kmer_match, f_len
332                );
333                eprintln!("  ref_kmer[{}]: {}", kmer_len, ref_kmer);
334                eprintln!("  tgt_kmer[{}]: {}", kmer_len, tgt_kmer);
335            }
336
337            if f_len >= self.key_len as usize {
338                let mut b_len = 0usize;
339                let max_back = no_prev_literals.min(h_pos).min(text_pos);
340                while b_len < max_back {
341                    if target[text_pos - b_len - 1] != self.reference[h_pos - b_len - 1] {
342                        break;
343                    }
344                    b_len += 1;
345                }
346                if b_len + f_len > min_to_update {
347                    best_len_bck = b_len as u32;
348                    best_len_fwd = f_len as u32;
349                    best_ref_pos = h_pos as u32;
350                    min_to_update = b_len + f_len;
351                } else if debug_lz && text_pos < 10 && probes < 3 {
352                    eprintln!("RAGC_LZ_TOO_SHORT: text_pos={} probe={} h_pos={} total_len={} < min_to_update={}",
353                        text_pos, j, h_pos, b_len + f_len, min_to_update);
354                }
355            } else if debug_lz && text_pos < 10 && probes < 3 {
356                eprintln!(
357                    "RAGC_LZ_FLEN_SHORT: text_pos={} probe={} h_pos={} f_len={} < key_len={}",
358                    text_pos, j, h_pos, f_len, self.key_len
359                );
360            }
361        }
362
363        if debug_lz && text_pos < 100 {
364            eprintln!(
365                "RAGC_LZ_LOOKUP: text_pos={} probes={} found_slot={} best_len={}",
366                text_pos,
367                probes,
368                found_match,
369                best_len_bck + best_len_fwd
370            );
371        }
372
373        if (best_len_bck + best_len_fwd) as usize >= self.min_match_len as usize {
374            Some((best_ref_pos, best_len_bck, best_len_fwd))
375        } else {
376            None
377        }
378    }
379
380    /// Count matching length between two sequences
381    fn matching_length(s1: &[u8], s2: &[u8], max_len: usize) -> usize {
382        let mut len = 0;
383        let max = max_len.min(s1.len()).min(s2.len());
384        while len < max && s1[len] == s2[len] {
385            len += 1;
386        }
387        len
388    }
389
390    /// Encode target sequence relative to reference
391    pub fn encode(&mut self, target: &Contig) -> Vec<u8> {
392        // Pre-allocate capacity to avoid repeated reallocations
393        // Typical LZ compression achieves 2-4:1, so estimate capacity as target_len / 2
394        let mut encoded = Vec::with_capacity(target.len() / 2);
395
396        // Debug logging (only if RAGC_DEBUG_LZ=1)
397        let debug_lz = crate::env_cache::debug_lz_enabled();
398        let mut match_count = 0u32;
399        let mut literal_count = 0u32;
400        let mut bang_count = 0u32;
401        let mut total_match_len = 0u64;
402        let mut nrun_count = 0u32;
403
404        // Optimization: if target equals reference, return empty
405        if target.len() == self.reference_len
406            && target
407                .iter()
408                .zip(self.reference.iter())
409                .all(|(a, b)| a == b)
410        {
411            if debug_lz {
412                eprintln!(
413                    "RAGC_LZ: target == reference, returning empty (len={})",
414                    target.len()
415                );
416            }
417            return encoded;
418        }
419
420        if debug_lz {
421            eprintln!(
422                "RAGC_LZ_START: ref_len={} target_len={} min_match={} ht_lp_size={}",
423                self.reference_len,
424                target.len(),
425                self.min_match_len,
426                self.ht_lp.len()
427            );
428            eprintln!("RAGC_LZ_ENCODING_TRACE: Starting LZ encoding");
429        }
430
431        let text_size = target.len();
432        let mut i = 0;
433        let mut pred_pos = 0u32;
434        let mut no_prev_literals = 0usize;
435        let mut x_prev: Option<u64> = None;
436
437        while i + (self.key_len as usize) < text_size {
438            // Get k-mer code
439            let x = if let Some(prev) = x_prev {
440                if no_prev_literals > 0 {
441                    self.get_code_skip1(prev, &target[i..])
442                } else {
443                    self.get_code(&target[i..])
444                }
445            } else {
446                self.get_code(&target[i..])
447            };
448
449            x_prev = x;
450
451            if x.is_none() {
452                // Check for N-run
453                let nrun_len = self.get_nrun_len(&target[i..], text_size - i);
454
455                if nrun_len >= MIN_NRUN_LEN {
456                    if debug_lz && nrun_count < 5 {
457                        eprintln!("RAGC_LZ_NRUN: i={} len={}", i, nrun_len);
458                    }
459                    nrun_count += 1;
460                    self.encode_nrun(nrun_len, &mut encoded);
461                    i += nrun_len as usize;
462                    no_prev_literals = 0;
463                } else {
464                    // Single literal
465                    self.encode_literal(target[i], &mut encoded);
466                    i += 1;
467                    pred_pos += 1;
468                    no_prev_literals += 1;
469                }
470                continue;
471            }
472
473            // Try to find match
474            let kmer_code = x.unwrap();
475            let hash = MurMur64Hash::hash(kmer_code);
476            let max_len = text_size - i;
477
478            if let Some((match_pos, len_bck, len_fwd)) =
479                self.find_best_match_lp(kmer_code, hash, target, i, max_len, no_prev_literals)
480            {
481                // Handle backward extension
482                if len_bck > 0 {
483                    for _ in 0..len_bck {
484                        encoded.pop();
485                    }
486                    i -= len_bck as usize;
487                    pred_pos -= len_bck;
488                }
489
490                // Check if this is a match to end of sequence
491                // C++ AGC (line 781): i + len_bck + len_fwd == text_size && match_pos + len_bck + len_fwd == reference.size() - key_len
492                // But match_pos was already adjusted (line 762: match_pos -= len_bck), so:
493                // After C++ adjustment: (match_pos_adjusted) + len_bck + len_fwd = original_match_pos + len_fwd
494                // Since our match_pos is NOT adjusted yet, we check: match_pos + len_fwd == reference_len
495                let total_len = len_bck + len_fwd;
496                let len_to_encode = if i + (total_len as usize) == text_size
497                    && (match_pos as usize) + (len_fwd as usize) == self.reference_len
498                {
499                    None // Match to end
500                } else {
501                    Some(total_len)
502                };
503
504                let adjusted_match_pos = match_pos - len_bck;
505
506                // C++ AGC optimization (lz_diff.cpp lines 769-779): when match_pos == pred_pos,
507                // convert preceding literals that match the reference to '!' for better compression.
508                // IMPORTANT: This must be done BEFORE encode_match, so the last bytes in buffer are literals.
509                // The '!' character is decoded by looking up reference[pred_pos].
510                let mut bang_replacements = 0u32;
511                if adjusted_match_pos == pred_pos {
512                    let e_size = encoded.len();
513                    // C++: for (uint32_t i = 1; i < e_size && i < match_pos; ++i)
514                    let max_scan = e_size.min(adjusted_match_pos as usize);
515                    for scan_i in 1..max_scan {
516                        let enc_idx = e_size - scan_i;
517                        let c = encoded[enc_idx];
518                        // Stop if not a literal (A-Z range)
519                        if c < b'A' || c > b'Z' {
520                            break;
521                        }
522                        // Check if literal matches reference at corresponding position
523                        let base = c - b'A';
524                        let ref_idx = adjusted_match_pos as usize - scan_i;
525                        if base == self.reference[ref_idx] {
526                            encoded[enc_idx] = b'!';
527                            bang_replacements += 1;
528                        }
529                    }
530                    bang_count += bang_replacements;
531                }
532
533                if debug_lz && match_count < 10 {
534                    eprintln!("RAGC_LZ_MATCH: i={} match_pos={} len_bck={} len_fwd={} total={} pred_pos={} bangs={}",
535                        i, match_pos, len_bck, len_fwd, total_len, pred_pos, bang_replacements);
536                }
537
538                match_count += 1;
539                total_match_len += total_len as u64;
540                self.encode_match(adjusted_match_pos, len_to_encode, pred_pos, &mut encoded);
541
542                pred_pos = adjusted_match_pos + total_len;
543                i += total_len as usize;
544                no_prev_literals = 0;
545            } else {
546                // No match, encode literal
547                if debug_lz && literal_count < 10 {
548                    eprintln!("RAGC_LZ_LITERAL: i={} base={}", i, target[i]);
549                }
550                literal_count += 1;
551                self.encode_literal(target[i], &mut encoded);
552                i += 1;
553                pred_pos += 1;
554                no_prev_literals += 1;
555            }
556        }
557
558        // Encode remaining bases as literals
559        while i < text_size {
560            if debug_lz && literal_count < 10 {
561                eprintln!("RAGC_LZ_LITERAL_TAIL: i={} base={}", i, target[i]);
562            }
563            literal_count += 1;
564            self.encode_literal(target[i], &mut encoded);
565            i += 1;
566        }
567
568        if debug_lz {
569            let avg_match_len = if match_count > 0 {
570                total_match_len as f64 / match_count as f64
571            } else {
572                0.0
573            };
574            let compression_ratio = if target.len() > 0 {
575                encoded.len() as f64 / target.len() as f64
576            } else {
577                0.0
578            };
579            let bases_covered_by_matches = total_match_len;
580            let bases_as_literals = literal_count as u64;
581            let coverage_pct = if target.len() > 0 {
582                100.0 * bases_covered_by_matches as f64 / target.len() as f64
583            } else {
584                0.0
585            };
586
587            eprintln!("RAGC_LZ_END: matches={} (avg_len={:.1}) literals={} nruns={} bangs={} encoded_len={}",
588                match_count, avg_match_len, literal_count, nrun_count, bang_count, encoded.len());
589            eprintln!(
590                "RAGC_LZ_SUMMARY: target_len={} match_coverage={}/{} ({:.1}%) ratio={:.3}",
591                target.len(),
592                bases_covered_by_matches,
593                target.len(),
594                coverage_pct,
595                compression_ratio
596            );
597
598            // Debug: Show first/last 20 bytes when 0 matches - helps detect orientation mismatch
599            if match_count == 0 && target.len() > 40 {
600                let ref_first: String = self
601                    .reference
602                    .iter()
603                    .take(20)
604                    .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
605                    .collect();
606                let ref_last: String = self.reference
607                    [self.reference_len.saturating_sub(20)..self.reference_len]
608                    .iter()
609                    .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
610                    .collect();
611                let tgt_first: String = target
612                    .iter()
613                    .take(20)
614                    .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
615                    .collect();
616                let tgt_last: String = target[target.len().saturating_sub(20)..]
617                    .iter()
618                    .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
619                    .collect();
620                eprintln!(
621                    "RAGC_ZERO_MATCH_DATA: ref_first={} ref_last={} tgt_first={} tgt_last={}",
622                    ref_first, ref_last, tgt_first, tgt_last
623                );
624                // Check if target matches reverse of reference (strong orientation mismatch signal)
625                let ref_rc_first: String = self.reference
626                    [self.reference_len.saturating_sub(20)..self.reference_len]
627                    .iter()
628                    .rev()
629                    .map(|&b| match b {
630                        0 => 3,
631                        1 => 2,
632                        2 => 1,
633                        3 => 0,
634                        _ => b,
635                    })
636                    .map(|b| if b < 4 { (b'A' + b) as char } else { 'N' })
637                    .collect();
638                if tgt_first == ref_rc_first {
639                    eprintln!("RAGC_ZERO_MATCH_ORIENTATION: Target matches RC of reference - ORIENTATION MISMATCH DETECTED!");
640                }
641            }
642
643            // Warning if encoding is bloated
644            if encoded.len() > target.len() {
645                eprintln!(
646                    "RAGC_LZ_WARNING: encoded LARGER than target! {} vs {} bytes (+{})",
647                    encoded.len(),
648                    target.len(),
649                    encoded.len() - target.len()
650                );
651            }
652        }
653
654        encoded
655    }
656
657    /// Decode encoded sequence using reference
658    pub fn decode(&self, encoded: &[u8]) -> Vec<u8> {
659        let debug_decode = crate::env_cache::debug_lz_decode();
660        let mut decoded = Vec::new();
661        let mut pred_pos = 0usize;
662        let mut i = 0;
663        let mut op_count = 0;
664
665        while i < encoded.len() {
666            if self.is_literal(encoded[i]) {
667                let c = self.decode_literal(encoded[i]);
668                let actual_c = if c == b'!' {
669                    self.reference[pred_pos]
670                } else {
671                    c
672                };
673                decoded.push(actual_c);
674                if debug_decode && op_count < 10 {
675                    eprintln!("  LZ_DECODE op={}: LITERAL byte={} c={} actual_c={} pred_pos={} decoded_len={}",
676                        op_count, encoded[i], c, actual_c, pred_pos, decoded.len());
677                }
678                pred_pos += 1;
679                i += 1;
680                op_count += 1;
681            } else if encoded[i] == N_RUN_STARTER_CODE {
682                let (len, consumed) = self.decode_nrun(&encoded[i..]);
683                decoded.resize(decoded.len() + len as usize, N_CODE);
684                if debug_decode && op_count < 10 {
685                    eprintln!(
686                        "  LZ_DECODE op={}: NRUN len={} consumed={} decoded_len={}",
687                        op_count,
688                        len,
689                        consumed,
690                        decoded.len()
691                    );
692                }
693                i += consumed;
694                op_count += 1;
695            } else {
696                // It's a match
697                let (ref_pos, len, consumed) = self.decode_match(&encoded[i..], pred_pos);
698                let actual_len = if len == u32::MAX {
699                    // Match to end: use original reference length (before padding)
700                    self.reference_len - ref_pos
701                } else {
702                    len as usize
703                };
704                if debug_decode && op_count < 10 {
705                    eprintln!("  LZ_DECODE op={}: MATCH ref_pos={} len={} actual_len={} consumed={} pred_pos={} ref_len={} decoded_len_before={}",
706                        op_count, ref_pos, len, actual_len, consumed, pred_pos, self.reference_len, decoded.len());
707                }
708                decoded.extend_from_slice(&self.reference[ref_pos..ref_pos + actual_len]);
709                pred_pos = ref_pos + actual_len;
710                i += consumed;
711                op_count += 1;
712            }
713        }
714
715        if debug_decode {
716            eprintln!(
717                "  LZ_DECODE: total_ops={} final_decoded_len={} ref_len={}",
718                op_count,
719                decoded.len(),
720                self.reference_len
721            );
722        }
723
724        // Debug: trace when decoded is significantly longer than reference
725        if crate::env_cache::debug_lz_decode_full() && decoded.len() > self.reference_len + 50 {
726            eprintln!(
727                "  LZ_DECODE_BLOAT: ref_len={} decoded_len={} encoded_len={}",
728                self.reference_len,
729                decoded.len(),
730                encoded.len()
731            );
732            eprintln!(
733                "    Encoded first 100 bytes: {:?}",
734                &encoded[..encoded.len().min(100)]
735            );
736        }
737
738        decoded
739    }
740
741    /// Check if byte is a literal
742    fn is_literal(&self, c: u8) -> bool {
743        (b'A'..=b'A' + 20).contains(&c) || c == b'!'
744    }
745
746    /// Decode a literal
747    fn decode_literal(&self, c: u8) -> u8 {
748        if c == b'!' {
749            b'!'
750        } else {
751            c - b'A'
752        }
753    }
754
755    /// Decode an N-run, returns (length, bytes_consumed)
756    fn decode_nrun(&self, data: &[u8]) -> (u32, usize) {
757        let mut i = 1; // Skip starter code
758        let (raw_len, len_bytes) = self.read_int(&data[i..]);
759        i += len_bytes;
760        i += 1; // Skip N_CODE suffix
761        ((raw_len as u32) + MIN_NRUN_LEN, i)
762    }
763
764    /// Decode a match, returns (ref_pos, length, bytes_consumed)
765    /// Format: <pos>,<len>. or <pos>. (comma only present when length is specified)
766    fn decode_match(&self, data: &[u8], pred_pos: usize) -> (usize, u32, usize) {
767        let mut i = 0;
768        let (raw_pos, pos_bytes) = self.read_int(&data[i..]);
769        i += pos_bytes;
770
771        let ref_pos = ((pred_pos as i64) + raw_pos) as usize;
772
773        // C++ AGC format has two cases:
774        // - With length: <pos>,<len>.
775        // - Without length (to end): <pos>.
776
777        // Bounds check before reading next character
778        if i >= data.len() {
779            eprintln!("ERROR: decode_match - expected comma or period at position {} but data length is {}", i, data.len());
780            eprintln!(
781                "  ref_pos={}, data prefix: {:?}",
782                ref_pos,
783                &data[..data.len().min(20)]
784            );
785            panic!("Malformed LZ match encoding: missing separator after position");
786        }
787
788        let len = if data[i] == b'.' {
789            // "To end" case: <pos>.
790            i += 1; // Skip period
791            u32::MAX // Sentinel for "to end of sequence"
792        } else if data[i] == b',' {
793            // With length: <pos>,<len>.
794            i += 1; // Skip comma
795
796            if i >= data.len() {
797                eprintln!(
798                    "ERROR: decode_match - expected length at position {} but data length is {}",
799                    i,
800                    data.len()
801                );
802                eprintln!(
803                    "  ref_pos={}, data prefix: {:?}",
804                    ref_pos,
805                    &data[..data.len().min(20)]
806                );
807                panic!("Malformed LZ match encoding: missing length after comma");
808            }
809
810            let (raw_len, len_bytes) = self.read_int(&data[i..]);
811            i += len_bytes;
812            i += 1; // Skip period
813            (raw_len as u32) + self.min_match_len
814        } else {
815            eprintln!(
816                "ERROR: decode_match - unexpected character {} at position {}",
817                data[i], i
818            );
819            eprintln!(
820                "  ref_pos={}, data prefix: {:?}",
821                ref_pos,
822                &data[..data.len().min(20)]
823            );
824            panic!("Malformed LZ match encoding: expected comma or period");
825        };
826
827        (ref_pos, len, i)
828    }
829
830    /// Read ASCII decimal integer, returns (value, bytes_consumed)
831    fn read_int(&self, data: &[u8]) -> (i64, usize) {
832        let mut i = 0;
833        let mut is_neg = false;
834
835        if data[i] == b'-' {
836            is_neg = true;
837            i += 1;
838        }
839
840        let mut x = 0i64;
841        while i < data.len() && data[i] >= b'0' && data[i] <= b'9' {
842            x = x * 10 + ((data[i] - b'0') as i64);
843            i += 1;
844        }
845
846        if is_neg {
847            x = -x;
848        }
849
850        (x, i)
851    }
852
853    /// Get coding cost vector for target sequence
854    /// This computes the per-position cost of encoding the target against the reference
855    /// Returns a vector where v_costs[i] is the cost of encoding position i
856    /// If prefix_costs=true, match cost is placed at start of match; otherwise at end
857    pub fn get_coding_cost_vector(&self, target: &Contig, prefix_costs: bool) -> Vec<u32> {
858        let mut v_costs = Vec::with_capacity(target.len());
859
860        if self.reference.is_empty() {
861            return v_costs;
862        }
863
864        let text_size = target.len();
865        let mut i = 0;
866        let mut pred_pos = 0u32;
867        let mut no_prev_literals = 0usize;
868        let mut x_prev: Option<u64> = None;
869
870        while i + (self.key_len as usize) < text_size {
871            // Get k-mer code
872            let x = if let Some(prev) = x_prev {
873                if no_prev_literals > 0 {
874                    self.get_code_skip1(prev, &target[i..])
875                } else {
876                    self.get_code(&target[i..])
877                }
878            } else {
879                self.get_code(&target[i..])
880            };
881
882            x_prev = x;
883
884            if x.is_none() {
885                // Check for N-run
886                let nrun_len = self.get_nrun_len(&target[i..], text_size - i);
887
888                if nrun_len >= MIN_NRUN_LEN {
889                    let tc = self.coding_cost_nrun(nrun_len);
890                    if prefix_costs {
891                        v_costs.push(tc);
892                        for _ in 1..nrun_len {
893                            v_costs.push(0);
894                        }
895                    } else {
896                        for _ in 1..nrun_len {
897                            v_costs.push(0);
898                        }
899                        v_costs.push(tc);
900                    }
901                    i += nrun_len as usize;
902                    no_prev_literals = 0;
903                } else {
904                    // Single literal: cost is 1
905                    v_costs.push(1);
906                    i += 1;
907                    pred_pos += 1;
908                    no_prev_literals += 1;
909                }
910                continue;
911            }
912
913            // Try to find match
914            let kmer_code = x.unwrap();
915            let hash = MurMur64Hash::hash(kmer_code);
916            let max_len = text_size - i;
917
918            if let Some((match_pos, len_bck, len_fwd)) =
919                self.find_best_match_lp(kmer_code, hash, target, i, max_len, no_prev_literals)
920            {
921                // Handle backward extension
922                if len_bck > 0 {
923                    for _ in 0..len_bck {
924                        v_costs.pop();
925                    }
926                    i -= len_bck as usize;
927                    pred_pos -= len_bck;
928                }
929
930                let total_len = len_bck + len_fwd;
931                let tc = self.coding_cost_match(match_pos - len_bck, total_len, pred_pos);
932
933                if prefix_costs {
934                    v_costs.push(tc);
935                    for _ in 1..total_len {
936                        v_costs.push(0);
937                    }
938                } else {
939                    for _ in 1..total_len {
940                        v_costs.push(0);
941                    }
942                    v_costs.push(tc);
943                }
944
945                pred_pos = match_pos - len_bck + total_len;
946                i += total_len as usize;
947                no_prev_literals = 0;
948            } else {
949                // No match, literal cost is 1
950                v_costs.push(1);
951                i += 1;
952                pred_pos += 1;
953                no_prev_literals += 1;
954            }
955        }
956
957        // Remaining bases are literals
958        while i < text_size {
959            v_costs.push(1);
960            i += 1;
961        }
962
963        v_costs
964    }
965
966    /// Compute decimal digit length like C++ int_len()
967    fn int_len(x: u32) -> u32 {
968        if x < 10 {
969            1
970        } else if x < 100 {
971            2
972        } else if x < 1_000 {
973            3
974        } else if x < 10_000 {
975            4
976        } else if x < 100_000 {
977            5
978        } else if x < 1_000_000 {
979            6
980        } else if x < 10_000_000 {
981            7
982        } else if x < 100_000_000 {
983            8
984        } else if x < 1_000_000_000 {
985            9
986        } else {
987            10
988        }
989    }
990
991    /// Compute coding cost for N-run (matches C++ coding_cost_Nrun)
992    fn coding_cost_nrun(&self, len: u32) -> u32 {
993        let delta = len - MIN_NRUN_LEN;
994        // starter + decimal digits + suffix
995        1 + Self::int_len(delta) + 1
996    }
997
998    /// Compute coding cost for match (matches C++ coding_cost_match)
999    fn coding_cost_match(&self, match_pos: u32, len: u32, pred_pos: u32) -> u32 {
1000        let dif_pos = (match_pos as i32) - (pred_pos as i32);
1001        let pos_digits = if dif_pos >= 0 {
1002            Self::int_len(dif_pos as u32)
1003        } else {
1004            Self::int_len((-dif_pos) as u32) + 1 // sign
1005        };
1006
1007        let delta = len - self.min_match_len;
1008        let len_digits = Self::int_len(delta);
1009
1010        pos_digits + len_digits + 2 // pos + ',' + len + '.'
1011    }
1012
1013    /// Compute uint_len like C++ CLZDiff_V2::uint_len (caps at 8 digits)
1014    fn uint_len_v2(x: u32) -> u32 {
1015        if x < 10 {
1016            1
1017        } else if x < 100 {
1018            2
1019        } else if x < 1_000 {
1020            3
1021        } else if x < 10_000 {
1022            4
1023        } else if x < 100_000 {
1024            5
1025        } else if x < 1_000_000 {
1026            6
1027        } else if x < 10_000_000 {
1028            7
1029        } else {
1030            8
1031        }
1032    }
1033
1034    /// Compute int_len like C++ CLZDiff_V2::int_len
1035    fn int_len_v2(x: i32) -> u32 {
1036        if x >= 0 {
1037            Self::uint_len_v2(x as u32)
1038        } else {
1039            1 + Self::uint_len_v2((-x) as u32)
1040        }
1041    }
1042
1043    /// Compute cost_match like C++ CLZDiff_V2::cost_match
1044    /// Note: len == u32::MAX means "match to end of sequence" (no length encoding)
1045    fn cost_match_v2(&self, ref_pos: u32, len: u32, pred_pos: u32) -> u32 {
1046        let dif_pos = (ref_pos as i32) - (pred_pos as i32);
1047        let mut r = Self::int_len_v2(dif_pos);
1048
1049        if len != u32::MAX {
1050            r += 1 + Self::uint_len_v2(len - self.min_match_len);
1051        }
1052
1053        r + 1 // +1 for '.' terminator
1054    }
1055
1056    /// Compute cost_Nrun like C++ CLZDiff_V2::cost_Nrun
1057    fn cost_nrun_v2(len: u32) -> u32 {
1058        2 + Self::uint_len_v2(len - MIN_NRUN_LEN)
1059    }
1060
1061    /// Estimate encoding cost without actually encoding (matches C++ CLZDiff_V2::Estimate)
1062    /// This is faster than full encode and used for terminator grouping decisions.
1063    ///
1064    /// # Arguments
1065    /// * `target` - Target sequence to estimate encoding cost for
1066    /// * `bound` - Early termination bound (return early if cost exceeds this)
1067    ///
1068    /// # Returns
1069    /// Estimated encoding cost in bytes
1070    pub fn estimate(&self, target: &Contig, bound: u32) -> u32 {
1071        if self.ht_lp.is_empty() {
1072            return target.len() as u32; // No index, cost is all literals
1073        }
1074
1075        let text_size = target.len() as u32;
1076
1077        // Quick check for equal sequences
1078        if text_size == self.reference_len as u32 {
1079            if target
1080                .iter()
1081                .zip(self.reference.iter())
1082                .all(|(a, b)| a == b)
1083            {
1084                return 0; // Equal sequences
1085            }
1086        }
1087
1088        let mut est_cost = 0u32;
1089        let mut i = 0u32;
1090        let mut pred_pos = 0u32;
1091        let mut no_prev_literals = 0u32;
1092        let mut x_prev: Option<u64> = None;
1093        let text_ptr = target.as_slice();
1094
1095        while (i + self.key_len) < text_size {
1096            // Early termination
1097            if est_cost > bound {
1098                return est_cost;
1099            }
1100
1101            // Get k-mer code
1102            let x = if x_prev.is_some() && no_prev_literals > 0 {
1103                self.get_code_skip1(x_prev.unwrap(), &text_ptr[i as usize..])
1104            } else {
1105                self.get_code(&text_ptr[i as usize..])
1106            };
1107            x_prev = x;
1108
1109            if x.is_none() {
1110                // Check for N-run
1111                let nrun_len = self.get_nrun_len(&text_ptr[i as usize..], (text_size - i) as usize);
1112
1113                if nrun_len >= MIN_NRUN_LEN {
1114                    est_cost += Self::cost_nrun_v2(nrun_len);
1115                    i += nrun_len;
1116                    no_prev_literals = 0;
1117                } else {
1118                    // Single literal
1119                    est_cost += 1;
1120                    i += 1;
1121                    pred_pos += 1;
1122                    no_prev_literals += 1;
1123                }
1124                continue;
1125            }
1126
1127            // Look up k-mer in linear-probing hash table
1128            let kmer_code = x.unwrap();
1129            let hash = MurMur64Hash::hash(kmer_code);
1130            let max_len = (text_size - i) as usize;
1131
1132            if let Some((match_pos, len_bck, len_fwd)) = self.find_best_match_lp(
1133                kmer_code,
1134                hash,
1135                target,
1136                i as usize,
1137                max_len,
1138                no_prev_literals as usize,
1139            ) {
1140                let total_len = len_bck + len_fwd;
1141                // CRITICAL: C++ AGC's Estimate uses match_pos directly (NOT adjusted by len_bck)
1142                // This differs from the actual encode which does adjust for backward extension
1143
1144                // Check if this is a match to end of sequence
1145                // C++ AGC: i + len_bck + len_fwd == text_size && match_pos + len_bck + len_fwd == reference.size() - key_len
1146                let is_end_match = (i + total_len) == text_size
1147                    && (match_pos + total_len) as usize == self.reference_len;
1148
1149                if is_end_match {
1150                    est_cost += self.cost_match_v2(match_pos, u32::MAX, pred_pos);
1151                } else {
1152                    est_cost += self.cost_match_v2(match_pos, total_len, pred_pos);
1153                }
1154
1155                // C++ AGC: pred_pos = match_pos + len_bck + len_fwd (NOT adjusted)
1156                pred_pos = match_pos + total_len;
1157                i += total_len;
1158                no_prev_literals = 0;
1159            } else {
1160                // No match, literal cost is 1
1161                est_cost += 1;
1162                i += 1;
1163                pred_pos += 1;
1164                no_prev_literals += 1;
1165            }
1166        }
1167
1168        // Remaining bases are literals
1169        est_cost += text_size - i;
1170
1171        est_cost
1172    }
1173}
1174
1175#[cfg(test)]
1176mod tests {
1177    use super::*;
1178
1179    #[test]
1180    fn test_simple_literal() {
1181        let reference = vec![0, 0, 0, 1, 1, 1];
1182        let target = vec![0, 1, 2, 3];
1183
1184        let mut lz = LZDiff::new(18);
1185        lz.prepare(&reference);
1186
1187        let encoded = lz.encode(&target);
1188        let decoded = lz.decode(&encoded);
1189
1190        assert_eq!(target, decoded);
1191    }
1192
1193    #[test]
1194    fn test_identical_sequences() {
1195        let reference = vec![0, 1, 2, 3, 0, 1, 2, 3];
1196        let target = reference.clone();
1197
1198        let mut lz = LZDiff::new(18);
1199        lz.prepare(&reference);
1200
1201        let encoded = lz.encode(&target);
1202        // Should be empty (optimization)
1203        assert_eq!(encoded.len(), 0);
1204
1205        // Special handling for empty encoding
1206        let decoded = if encoded.is_empty() && target.len() == reference.len() {
1207            reference.clone()
1208        } else {
1209            lz.decode(&encoded)
1210        };
1211
1212        assert_eq!(target, decoded);
1213    }
1214}
ragc_core/lz_diff.rs

ragc_core/
lz_diff.rs