1#![allow(clippy::same_item_push)]
5
6use ragc_common::{hash::MurMur64Hash, types::Contig};
7use std::sync::atomic::{AtomicU64, Ordering};
8
9static TOTAL_MATCHES: AtomicU64 = AtomicU64::new(0);
11static TOTAL_LITERALS: AtomicU64 = AtomicU64::new(0);
12static TOTAL_BANG_REPLACEMENTS: AtomicU64 = AtomicU64::new(0);
13static TOTAL_ENCODED_BYTES: AtomicU64 = AtomicU64::new(0);
14static TOTAL_NRUNS: AtomicU64 = AtomicU64::new(0);
15
16pub fn print_lz_stats() {
18 eprintln!(
19 "LZ_STATS: matches={} literals={} bang_replacements={} nruns={} encoded_bytes={}",
20 TOTAL_MATCHES.load(Ordering::Relaxed),
21 TOTAL_LITERALS.load(Ordering::Relaxed),
22 TOTAL_BANG_REPLACEMENTS.load(Ordering::Relaxed),
23 TOTAL_NRUNS.load(Ordering::Relaxed),
24 TOTAL_ENCODED_BYTES.load(Ordering::Relaxed)
25 );
26}
27
28const N_CODE: u8 = 4;
30const N_RUN_STARTER_CODE: u8 = 30;
31const MIN_NRUN_LEN: u32 = 4;
32const MAX_NO_TRIES: usize = 64;
33const HASHING_STEP: usize = 4; pub struct LZDiff {
37 reference: Vec<u8>,
38 reference_len: usize, ht_lp: Vec<u32>, ht_mask: u64,
42 min_match_len: u32,
43 key_len: u32,
44 key_mask: u64,
45}
46
47impl LZDiff {
48 pub fn new(min_match_len: u32) -> Self {
50 let key_len = min_match_len - (HASHING_STEP as u32) + 1;
51 let key_mask = if key_len >= 32 {
52 !0u64
53 } else {
54 (1u64 << (2 * key_len)) - 1
55 };
56
57 LZDiff {
58 reference: Vec::new(),
59 reference_len: 0,
60 ht_lp: Vec::new(),
61 ht_mask: 0,
62 min_match_len,
63 key_len,
64 key_mask,
65 }
66 }
67
68 pub fn prepare(&mut self, reference: &Contig) {
70 let debug_lz = crate::env_cache::debug_lz_enabled();
71
72 self.reference = reference.clone();
73 self.reference_len = reference.len(); self.reference
76 .resize(self.reference.len() + self.key_len as usize, 31);
77
78 self.build_index_lp();
79
80 if debug_lz {
81 eprintln!(
82 "RAGC_LZ_PREPARE: ref_len={} key_len={} ht_lp_size={} ht_mask={:#x}",
83 self.reference_len,
84 self.key_len,
85 self.ht_lp.len(),
86 self.ht_mask
87 );
88 let filled = self.ht_lp.iter().filter(|&&x| x != u32::MAX).count();
90 eprintln!(
91 "RAGC_LZ_PREPARE: ht_lp filled={}/{} ({:.1}%)",
92 filled,
93 self.ht_lp.len(),
94 100.0 * filled as f64 / self.ht_lp.len() as f64
95 );
96 }
97 }
98
99 fn build_index_lp(&mut self) {
101 let mut ht_size: u64 = 0;
103 let mut no_prev_valid: u32 = 0;
104 let mut cnt_mod: u32 = 0;
105 let key_len_mod: u32 = self.key_len % (HASHING_STEP as u32);
106 for &c in &self.reference {
107 if c < 4 {
108 no_prev_valid += 1;
109 } else {
110 no_prev_valid = 0;
111 }
112 cnt_mod += 1;
113 if cnt_mod == HASHING_STEP as u32 {
114 cnt_mod = 0;
115 }
116 if cnt_mod == key_len_mod && no_prev_valid >= self.key_len {
117 ht_size += 1;
118 }
119 }
120
121 let mut ht_size = (ht_size as f64 / 0.7) as u64;
123 if ht_size == 0 {
124 ht_size = 1;
125 }
126 while (ht_size & (ht_size - 1)) != 0 {
127 ht_size &= ht_size - 1;
128 }
129 ht_size <<= 1;
130 if ht_size < 8 {
131 ht_size = 8;
132 }
133
134 self.ht_mask = ht_size - 1;
135 self.ht_lp.clear();
136 self.ht_lp.resize(ht_size as usize, u32::MAX);
137
138 let ref_len = self.reference.len();
140 let mut i = 0usize;
141 while i + (self.key_len as usize) < ref_len {
142 if let Some(code) = self.get_code(&self.reference[i..]) {
143 let base = (MurMur64Hash::hash(code) & self.ht_mask) as usize;
144 for j in 0..MAX_NO_TRIES {
145 let idx = (base + j) & (self.ht_mask as usize);
146 if self.ht_lp[idx] == u32::MAX {
147 self.ht_lp[idx] = (i / HASHING_STEP) as u32;
148 break;
149 }
150 }
151 }
152 i += HASHING_STEP;
153 }
154 }
155
156 #[allow(clippy::needless_range_loop)]
158 fn get_code(&self, seq: &[u8]) -> Option<u64> {
159 let mut code = 0u64;
160 for i in 0..(self.key_len as usize) {
161 if seq[i] > 3 {
162 return None; }
164 code = (code << 2) | (seq[i] as u64);
165 }
166 Some(code)
167 }
168
169 fn get_code_skip1(&self, prev_code: u64, seq: &[u8]) -> Option<u64> {
171 let last_base_idx = (self.key_len as usize) - 1;
172 if seq[last_base_idx] > 3 {
173 return None;
174 }
175 let code = ((prev_code << 2) & self.key_mask) | (seq[last_base_idx] as u64);
176 Some(code)
177 }
178
179 fn get_nrun_len(&self, seq: &[u8], max_len: usize) -> u32 {
181 if seq.len() < 3 || seq[0] != N_CODE || seq[1] != N_CODE || seq[2] != N_CODE {
182 return 0;
183 }
184
185 let mut len = 3;
186 while len < max_len && seq[len] == N_CODE {
187 len += 1;
188 }
189 len as u32
190 }
191
192 fn encode_literal(&self, base: u8, encoded: &mut Vec<u8>) {
194 encoded.push(b'A' + base);
195 }
196
197 fn encode_nrun(&self, len: u32, encoded: &mut Vec<u8>) {
199 encoded.push(N_RUN_STARTER_CODE);
200 self.append_int(encoded, (len - MIN_NRUN_LEN) as i64);
201 encoded.push(N_CODE);
202 }
203
204 fn encode_match(&self, ref_pos: u32, len: Option<u32>, pred_pos: u32, encoded: &mut Vec<u8>) {
209 let dif_pos = (ref_pos as i32) - (pred_pos as i32);
210 self.append_int(encoded, dif_pos as i64);
211
212 if let Some(match_len) = len {
215 encoded.push(b',');
216 self.append_int(encoded, (match_len - self.min_match_len) as i64);
217 }
218
219 encoded.push(b'.');
220 }
221
222 fn append_int(&self, text: &mut Vec<u8>, mut x: i64) {
224 if x == 0 {
225 text.push(b'0');
226 return;
227 }
228
229 if x < 0 {
230 text.push(b'-');
231 x = -x;
232 }
233
234 let start_pos = text.len();
236 while x > 0 {
237 text.push(b'0' + (x % 10) as u8);
238 x /= 10;
239 }
240
241 text[start_pos..].reverse();
243 }
244
245 fn find_best_match_lp(
247 &self,
248 kmer_code: u64,
249 hash: u64,
250 target: &[u8],
251 text_pos: usize,
252 max_len: usize,
253 no_prev_literals: usize,
254 ) -> Option<(u32, u32, u32)> {
255 let debug_lz = crate::env_cache::debug_lz_enabled();
256 if self.ht_lp.is_empty() {
257 return None;
258 }
259
260 let mut best_ref_pos = 0u32;
261 let mut best_len_bck = 0u32;
262 let mut best_len_fwd = 0u32;
263 let mut min_to_update = self.min_match_len as usize;
264
265 let ht_pos = (hash & self.ht_mask) as usize;
266 let mut probes = 0usize;
267 let mut found_match = false;
268
269 for j in 0..MAX_NO_TRIES {
270 let idx = (ht_pos + j) & (self.ht_mask as usize);
271 let slot = self.ht_lp[idx];
272 probes += 1;
273 if slot == u32::MAX {
274 break;
275 }
276 found_match = true;
277
278 let h_pos = (slot as usize) * HASHING_STEP;
279 if h_pos >= self.reference.len() {
280 continue;
281 }
282
283 if let Some(ref_kmer_code) = self.get_code(&self.reference[h_pos..]) {
285 if ref_kmer_code != kmer_code {
286 if debug_lz && text_pos < 10 && probes < 3 {
288 eprintln!(
289 "RAGC_LZ_COLLISION: text_pos={} probe={} h_pos={} kmer_mismatch",
290 text_pos, j, h_pos
291 );
292 }
293 continue;
294 }
295 } else {
296 if debug_lz && text_pos < 10 && probes < 3 {
298 eprintln!(
299 "RAGC_LZ_INVALID_KMER: text_pos={} probe={} h_pos={} contains_N",
300 text_pos, j, h_pos
301 );
302 }
303 continue;
304 }
305
306 let ref_ptr = &self.reference[h_pos..];
307 let text_ptr = &target[text_pos..];
308 let f_len = Self::matching_length(text_ptr, ref_ptr, max_len);
309
310 if debug_lz && text_pos < 5 && j == 0 {
311 let kmer_len = self.key_len as usize;
313 let ref_kmer: String = ref_ptr
314 .iter()
315 .take(kmer_len)
316 .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
317 .collect();
318 let tgt_kmer: String = text_ptr
319 .iter()
320 .take(kmer_len)
321 .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
322 .collect();
323 let kmer_match = ref_ptr
324 .iter()
325 .zip(text_ptr.iter())
326 .take(kmer_len)
327 .all(|(a, b)| a == b);
328
329 eprintln!(
330 "RAGC_LZ_KMER: text_pos={} h_pos={} kmer_match={} f_len={}",
331 text_pos, h_pos, kmer_match, f_len
332 );
333 eprintln!(" ref_kmer[{}]: {}", kmer_len, ref_kmer);
334 eprintln!(" tgt_kmer[{}]: {}", kmer_len, tgt_kmer);
335 }
336
337 if f_len >= self.key_len as usize {
338 let mut b_len = 0usize;
339 let max_back = no_prev_literals.min(h_pos).min(text_pos);
340 while b_len < max_back {
341 if target[text_pos - b_len - 1] != self.reference[h_pos - b_len - 1] {
342 break;
343 }
344 b_len += 1;
345 }
346 if b_len + f_len > min_to_update {
347 best_len_bck = b_len as u32;
348 best_len_fwd = f_len as u32;
349 best_ref_pos = h_pos as u32;
350 min_to_update = b_len + f_len;
351 } else if debug_lz && text_pos < 10 && probes < 3 {
352 eprintln!("RAGC_LZ_TOO_SHORT: text_pos={} probe={} h_pos={} total_len={} < min_to_update={}",
353 text_pos, j, h_pos, b_len + f_len, min_to_update);
354 }
355 } else if debug_lz && text_pos < 10 && probes < 3 {
356 eprintln!(
357 "RAGC_LZ_FLEN_SHORT: text_pos={} probe={} h_pos={} f_len={} < key_len={}",
358 text_pos, j, h_pos, f_len, self.key_len
359 );
360 }
361 }
362
363 if debug_lz && text_pos < 100 {
364 eprintln!(
365 "RAGC_LZ_LOOKUP: text_pos={} probes={} found_slot={} best_len={}",
366 text_pos,
367 probes,
368 found_match,
369 best_len_bck + best_len_fwd
370 );
371 }
372
373 if (best_len_bck + best_len_fwd) as usize >= self.min_match_len as usize {
374 Some((best_ref_pos, best_len_bck, best_len_fwd))
375 } else {
376 None
377 }
378 }
379
380 fn matching_length(s1: &[u8], s2: &[u8], max_len: usize) -> usize {
382 let mut len = 0;
383 let max = max_len.min(s1.len()).min(s2.len());
384 while len < max && s1[len] == s2[len] {
385 len += 1;
386 }
387 len
388 }
389
390 pub fn encode(&mut self, target: &Contig) -> Vec<u8> {
392 let mut encoded = Vec::with_capacity(target.len() / 2);
395
396 let debug_lz = crate::env_cache::debug_lz_enabled();
398 let mut match_count = 0u32;
399 let mut literal_count = 0u32;
400 let mut bang_count = 0u32;
401 let mut total_match_len = 0u64;
402 let mut nrun_count = 0u32;
403
404 if target.len() == self.reference_len
406 && target
407 .iter()
408 .zip(self.reference.iter())
409 .all(|(a, b)| a == b)
410 {
411 if debug_lz {
412 eprintln!(
413 "RAGC_LZ: target == reference, returning empty (len={})",
414 target.len()
415 );
416 }
417 return encoded;
418 }
419
420 if debug_lz {
421 eprintln!(
422 "RAGC_LZ_START: ref_len={} target_len={} min_match={} ht_lp_size={}",
423 self.reference_len,
424 target.len(),
425 self.min_match_len,
426 self.ht_lp.len()
427 );
428 eprintln!("RAGC_LZ_ENCODING_TRACE: Starting LZ encoding");
429 }
430
431 let text_size = target.len();
432 let mut i = 0;
433 let mut pred_pos = 0u32;
434 let mut no_prev_literals = 0usize;
435 let mut x_prev: Option<u64> = None;
436
437 while i + (self.key_len as usize) < text_size {
438 let x = if let Some(prev) = x_prev {
440 if no_prev_literals > 0 {
441 self.get_code_skip1(prev, &target[i..])
442 } else {
443 self.get_code(&target[i..])
444 }
445 } else {
446 self.get_code(&target[i..])
447 };
448
449 x_prev = x;
450
451 if x.is_none() {
452 let nrun_len = self.get_nrun_len(&target[i..], text_size - i);
454
455 if nrun_len >= MIN_NRUN_LEN {
456 if debug_lz && nrun_count < 5 {
457 eprintln!("RAGC_LZ_NRUN: i={} len={}", i, nrun_len);
458 }
459 nrun_count += 1;
460 self.encode_nrun(nrun_len, &mut encoded);
461 i += nrun_len as usize;
462 no_prev_literals = 0;
463 } else {
464 self.encode_literal(target[i], &mut encoded);
466 i += 1;
467 pred_pos += 1;
468 no_prev_literals += 1;
469 }
470 continue;
471 }
472
473 let kmer_code = x.unwrap();
475 let hash = MurMur64Hash::hash(kmer_code);
476 let max_len = text_size - i;
477
478 if let Some((match_pos, len_bck, len_fwd)) =
479 self.find_best_match_lp(kmer_code, hash, target, i, max_len, no_prev_literals)
480 {
481 if len_bck > 0 {
483 for _ in 0..len_bck {
484 encoded.pop();
485 }
486 i -= len_bck as usize;
487 pred_pos -= len_bck;
488 }
489
490 let total_len = len_bck + len_fwd;
496 let len_to_encode = if i + (total_len as usize) == text_size
497 && (match_pos as usize) + (len_fwd as usize) == self.reference_len
498 {
499 None } else {
501 Some(total_len)
502 };
503
504 let adjusted_match_pos = match_pos - len_bck;
505
506 let mut bang_replacements = 0u32;
511 if adjusted_match_pos == pred_pos {
512 let e_size = encoded.len();
513 let max_scan = e_size.min(adjusted_match_pos as usize);
515 for scan_i in 1..max_scan {
516 let enc_idx = e_size - scan_i;
517 let c = encoded[enc_idx];
518 if c < b'A' || c > b'Z' {
520 break;
521 }
522 let base = c - b'A';
524 let ref_idx = adjusted_match_pos as usize - scan_i;
525 if base == self.reference[ref_idx] {
526 encoded[enc_idx] = b'!';
527 bang_replacements += 1;
528 }
529 }
530 bang_count += bang_replacements;
531 }
532
533 if debug_lz && match_count < 10 {
534 eprintln!("RAGC_LZ_MATCH: i={} match_pos={} len_bck={} len_fwd={} total={} pred_pos={} bangs={}",
535 i, match_pos, len_bck, len_fwd, total_len, pred_pos, bang_replacements);
536 }
537
538 match_count += 1;
539 total_match_len += total_len as u64;
540 self.encode_match(adjusted_match_pos, len_to_encode, pred_pos, &mut encoded);
541
542 pred_pos = adjusted_match_pos + total_len;
543 i += total_len as usize;
544 no_prev_literals = 0;
545 } else {
546 if debug_lz && literal_count < 10 {
548 eprintln!("RAGC_LZ_LITERAL: i={} base={}", i, target[i]);
549 }
550 literal_count += 1;
551 self.encode_literal(target[i], &mut encoded);
552 i += 1;
553 pred_pos += 1;
554 no_prev_literals += 1;
555 }
556 }
557
558 while i < text_size {
560 if debug_lz && literal_count < 10 {
561 eprintln!("RAGC_LZ_LITERAL_TAIL: i={} base={}", i, target[i]);
562 }
563 literal_count += 1;
564 self.encode_literal(target[i], &mut encoded);
565 i += 1;
566 }
567
568 if debug_lz {
569 let avg_match_len = if match_count > 0 {
570 total_match_len as f64 / match_count as f64
571 } else {
572 0.0
573 };
574 let compression_ratio = if target.len() > 0 {
575 encoded.len() as f64 / target.len() as f64
576 } else {
577 0.0
578 };
579 let bases_covered_by_matches = total_match_len;
580 let bases_as_literals = literal_count as u64;
581 let coverage_pct = if target.len() > 0 {
582 100.0 * bases_covered_by_matches as f64 / target.len() as f64
583 } else {
584 0.0
585 };
586
587 eprintln!("RAGC_LZ_END: matches={} (avg_len={:.1}) literals={} nruns={} bangs={} encoded_len={}",
588 match_count, avg_match_len, literal_count, nrun_count, bang_count, encoded.len());
589 eprintln!(
590 "RAGC_LZ_SUMMARY: target_len={} match_coverage={}/{} ({:.1}%) ratio={:.3}",
591 target.len(),
592 bases_covered_by_matches,
593 target.len(),
594 coverage_pct,
595 compression_ratio
596 );
597
598 if match_count == 0 && target.len() > 40 {
600 let ref_first: String = self
601 .reference
602 .iter()
603 .take(20)
604 .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
605 .collect();
606 let ref_last: String = self.reference
607 [self.reference_len.saturating_sub(20)..self.reference_len]
608 .iter()
609 .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
610 .collect();
611 let tgt_first: String = target
612 .iter()
613 .take(20)
614 .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
615 .collect();
616 let tgt_last: String = target[target.len().saturating_sub(20)..]
617 .iter()
618 .map(|&b| if b < 4 { (b'A' + b) as char } else { 'N' })
619 .collect();
620 eprintln!(
621 "RAGC_ZERO_MATCH_DATA: ref_first={} ref_last={} tgt_first={} tgt_last={}",
622 ref_first, ref_last, tgt_first, tgt_last
623 );
624 let ref_rc_first: String = self.reference
626 [self.reference_len.saturating_sub(20)..self.reference_len]
627 .iter()
628 .rev()
629 .map(|&b| match b {
630 0 => 3,
631 1 => 2,
632 2 => 1,
633 3 => 0,
634 _ => b,
635 })
636 .map(|b| if b < 4 { (b'A' + b) as char } else { 'N' })
637 .collect();
638 if tgt_first == ref_rc_first {
639 eprintln!("RAGC_ZERO_MATCH_ORIENTATION: Target matches RC of reference - ORIENTATION MISMATCH DETECTED!");
640 }
641 }
642
643 if encoded.len() > target.len() {
645 eprintln!(
646 "RAGC_LZ_WARNING: encoded LARGER than target! {} vs {} bytes (+{})",
647 encoded.len(),
648 target.len(),
649 encoded.len() - target.len()
650 );
651 }
652 }
653
654 encoded
655 }
656
657 pub fn decode(&self, encoded: &[u8]) -> Vec<u8> {
659 let debug_decode = crate::env_cache::debug_lz_decode();
660 let mut decoded = Vec::new();
661 let mut pred_pos = 0usize;
662 let mut i = 0;
663 let mut op_count = 0;
664
665 while i < encoded.len() {
666 if self.is_literal(encoded[i]) {
667 let c = self.decode_literal(encoded[i]);
668 let actual_c = if c == b'!' {
669 self.reference[pred_pos]
670 } else {
671 c
672 };
673 decoded.push(actual_c);
674 if debug_decode && op_count < 10 {
675 eprintln!(" LZ_DECODE op={}: LITERAL byte={} c={} actual_c={} pred_pos={} decoded_len={}",
676 op_count, encoded[i], c, actual_c, pred_pos, decoded.len());
677 }
678 pred_pos += 1;
679 i += 1;
680 op_count += 1;
681 } else if encoded[i] == N_RUN_STARTER_CODE {
682 let (len, consumed) = self.decode_nrun(&encoded[i..]);
683 decoded.resize(decoded.len() + len as usize, N_CODE);
684 if debug_decode && op_count < 10 {
685 eprintln!(
686 " LZ_DECODE op={}: NRUN len={} consumed={} decoded_len={}",
687 op_count,
688 len,
689 consumed,
690 decoded.len()
691 );
692 }
693 i += consumed;
694 op_count += 1;
695 } else {
696 let (ref_pos, len, consumed) = self.decode_match(&encoded[i..], pred_pos);
698 let actual_len = if len == u32::MAX {
699 self.reference_len - ref_pos
701 } else {
702 len as usize
703 };
704 if debug_decode && op_count < 10 {
705 eprintln!(" LZ_DECODE op={}: MATCH ref_pos={} len={} actual_len={} consumed={} pred_pos={} ref_len={} decoded_len_before={}",
706 op_count, ref_pos, len, actual_len, consumed, pred_pos, self.reference_len, decoded.len());
707 }
708 decoded.extend_from_slice(&self.reference[ref_pos..ref_pos + actual_len]);
709 pred_pos = ref_pos + actual_len;
710 i += consumed;
711 op_count += 1;
712 }
713 }
714
715 if debug_decode {
716 eprintln!(
717 " LZ_DECODE: total_ops={} final_decoded_len={} ref_len={}",
718 op_count,
719 decoded.len(),
720 self.reference_len
721 );
722 }
723
724 if crate::env_cache::debug_lz_decode_full() && decoded.len() > self.reference_len + 50 {
726 eprintln!(
727 " LZ_DECODE_BLOAT: ref_len={} decoded_len={} encoded_len={}",
728 self.reference_len,
729 decoded.len(),
730 encoded.len()
731 );
732 eprintln!(
733 " Encoded first 100 bytes: {:?}",
734 &encoded[..encoded.len().min(100)]
735 );
736 }
737
738 decoded
739 }
740
741 fn is_literal(&self, c: u8) -> bool {
743 (b'A'..=b'A' + 20).contains(&c) || c == b'!'
744 }
745
746 fn decode_literal(&self, c: u8) -> u8 {
748 if c == b'!' {
749 b'!'
750 } else {
751 c - b'A'
752 }
753 }
754
755 fn decode_nrun(&self, data: &[u8]) -> (u32, usize) {
757 let mut i = 1; let (raw_len, len_bytes) = self.read_int(&data[i..]);
759 i += len_bytes;
760 i += 1; ((raw_len as u32) + MIN_NRUN_LEN, i)
762 }
763
764 fn decode_match(&self, data: &[u8], pred_pos: usize) -> (usize, u32, usize) {
767 let mut i = 0;
768 let (raw_pos, pos_bytes) = self.read_int(&data[i..]);
769 i += pos_bytes;
770
771 let ref_pos = ((pred_pos as i64) + raw_pos) as usize;
772
773 if i >= data.len() {
779 eprintln!("ERROR: decode_match - expected comma or period at position {} but data length is {}", i, data.len());
780 eprintln!(
781 " ref_pos={}, data prefix: {:?}",
782 ref_pos,
783 &data[..data.len().min(20)]
784 );
785 panic!("Malformed LZ match encoding: missing separator after position");
786 }
787
788 let len = if data[i] == b'.' {
789 i += 1; u32::MAX } else if data[i] == b',' {
793 i += 1; if i >= data.len() {
797 eprintln!(
798 "ERROR: decode_match - expected length at position {} but data length is {}",
799 i,
800 data.len()
801 );
802 eprintln!(
803 " ref_pos={}, data prefix: {:?}",
804 ref_pos,
805 &data[..data.len().min(20)]
806 );
807 panic!("Malformed LZ match encoding: missing length after comma");
808 }
809
810 let (raw_len, len_bytes) = self.read_int(&data[i..]);
811 i += len_bytes;
812 i += 1; (raw_len as u32) + self.min_match_len
814 } else {
815 eprintln!(
816 "ERROR: decode_match - unexpected character {} at position {}",
817 data[i], i
818 );
819 eprintln!(
820 " ref_pos={}, data prefix: {:?}",
821 ref_pos,
822 &data[..data.len().min(20)]
823 );
824 panic!("Malformed LZ match encoding: expected comma or period");
825 };
826
827 (ref_pos, len, i)
828 }
829
830 fn read_int(&self, data: &[u8]) -> (i64, usize) {
832 let mut i = 0;
833 let mut is_neg = false;
834
835 if data[i] == b'-' {
836 is_neg = true;
837 i += 1;
838 }
839
840 let mut x = 0i64;
841 while i < data.len() && data[i] >= b'0' && data[i] <= b'9' {
842 x = x * 10 + ((data[i] - b'0') as i64);
843 i += 1;
844 }
845
846 if is_neg {
847 x = -x;
848 }
849
850 (x, i)
851 }
852
853 pub fn get_coding_cost_vector(&self, target: &Contig, prefix_costs: bool) -> Vec<u32> {
858 let mut v_costs = Vec::with_capacity(target.len());
859
860 if self.reference.is_empty() {
861 return v_costs;
862 }
863
864 let text_size = target.len();
865 let mut i = 0;
866 let mut pred_pos = 0u32;
867 let mut no_prev_literals = 0usize;
868 let mut x_prev: Option<u64> = None;
869
870 while i + (self.key_len as usize) < text_size {
871 let x = if let Some(prev) = x_prev {
873 if no_prev_literals > 0 {
874 self.get_code_skip1(prev, &target[i..])
875 } else {
876 self.get_code(&target[i..])
877 }
878 } else {
879 self.get_code(&target[i..])
880 };
881
882 x_prev = x;
883
884 if x.is_none() {
885 let nrun_len = self.get_nrun_len(&target[i..], text_size - i);
887
888 if nrun_len >= MIN_NRUN_LEN {
889 let tc = self.coding_cost_nrun(nrun_len);
890 if prefix_costs {
891 v_costs.push(tc);
892 for _ in 1..nrun_len {
893 v_costs.push(0);
894 }
895 } else {
896 for _ in 1..nrun_len {
897 v_costs.push(0);
898 }
899 v_costs.push(tc);
900 }
901 i += nrun_len as usize;
902 no_prev_literals = 0;
903 } else {
904 v_costs.push(1);
906 i += 1;
907 pred_pos += 1;
908 no_prev_literals += 1;
909 }
910 continue;
911 }
912
913 let kmer_code = x.unwrap();
915 let hash = MurMur64Hash::hash(kmer_code);
916 let max_len = text_size - i;
917
918 if let Some((match_pos, len_bck, len_fwd)) =
919 self.find_best_match_lp(kmer_code, hash, target, i, max_len, no_prev_literals)
920 {
921 if len_bck > 0 {
923 for _ in 0..len_bck {
924 v_costs.pop();
925 }
926 i -= len_bck as usize;
927 pred_pos -= len_bck;
928 }
929
930 let total_len = len_bck + len_fwd;
931 let tc = self.coding_cost_match(match_pos - len_bck, total_len, pred_pos);
932
933 if prefix_costs {
934 v_costs.push(tc);
935 for _ in 1..total_len {
936 v_costs.push(0);
937 }
938 } else {
939 for _ in 1..total_len {
940 v_costs.push(0);
941 }
942 v_costs.push(tc);
943 }
944
945 pred_pos = match_pos - len_bck + total_len;
946 i += total_len as usize;
947 no_prev_literals = 0;
948 } else {
949 v_costs.push(1);
951 i += 1;
952 pred_pos += 1;
953 no_prev_literals += 1;
954 }
955 }
956
957 while i < text_size {
959 v_costs.push(1);
960 i += 1;
961 }
962
963 v_costs
964 }
965
966 fn int_len(x: u32) -> u32 {
968 if x < 10 {
969 1
970 } else if x < 100 {
971 2
972 } else if x < 1_000 {
973 3
974 } else if x < 10_000 {
975 4
976 } else if x < 100_000 {
977 5
978 } else if x < 1_000_000 {
979 6
980 } else if x < 10_000_000 {
981 7
982 } else if x < 100_000_000 {
983 8
984 } else if x < 1_000_000_000 {
985 9
986 } else {
987 10
988 }
989 }
990
991 fn coding_cost_nrun(&self, len: u32) -> u32 {
993 let delta = len - MIN_NRUN_LEN;
994 1 + Self::int_len(delta) + 1
996 }
997
998 fn coding_cost_match(&self, match_pos: u32, len: u32, pred_pos: u32) -> u32 {
1000 let dif_pos = (match_pos as i32) - (pred_pos as i32);
1001 let pos_digits = if dif_pos >= 0 {
1002 Self::int_len(dif_pos as u32)
1003 } else {
1004 Self::int_len((-dif_pos) as u32) + 1 };
1006
1007 let delta = len - self.min_match_len;
1008 let len_digits = Self::int_len(delta);
1009
1010 pos_digits + len_digits + 2 }
1012
1013 fn uint_len_v2(x: u32) -> u32 {
1015 if x < 10 {
1016 1
1017 } else if x < 100 {
1018 2
1019 } else if x < 1_000 {
1020 3
1021 } else if x < 10_000 {
1022 4
1023 } else if x < 100_000 {
1024 5
1025 } else if x < 1_000_000 {
1026 6
1027 } else if x < 10_000_000 {
1028 7
1029 } else {
1030 8
1031 }
1032 }
1033
1034 fn int_len_v2(x: i32) -> u32 {
1036 if x >= 0 {
1037 Self::uint_len_v2(x as u32)
1038 } else {
1039 1 + Self::uint_len_v2((-x) as u32)
1040 }
1041 }
1042
1043 fn cost_match_v2(&self, ref_pos: u32, len: u32, pred_pos: u32) -> u32 {
1046 let dif_pos = (ref_pos as i32) - (pred_pos as i32);
1047 let mut r = Self::int_len_v2(dif_pos);
1048
1049 if len != u32::MAX {
1050 r += 1 + Self::uint_len_v2(len - self.min_match_len);
1051 }
1052
1053 r + 1 }
1055
1056 fn cost_nrun_v2(len: u32) -> u32 {
1058 2 + Self::uint_len_v2(len - MIN_NRUN_LEN)
1059 }
1060
1061 pub fn estimate(&self, target: &Contig, bound: u32) -> u32 {
1071 if self.ht_lp.is_empty() {
1072 return target.len() as u32; }
1074
1075 let text_size = target.len() as u32;
1076
1077 if text_size == self.reference_len as u32 {
1079 if target
1080 .iter()
1081 .zip(self.reference.iter())
1082 .all(|(a, b)| a == b)
1083 {
1084 return 0; }
1086 }
1087
1088 let mut est_cost = 0u32;
1089 let mut i = 0u32;
1090 let mut pred_pos = 0u32;
1091 let mut no_prev_literals = 0u32;
1092 let mut x_prev: Option<u64> = None;
1093 let text_ptr = target.as_slice();
1094
1095 while (i + self.key_len) < text_size {
1096 if est_cost > bound {
1098 return est_cost;
1099 }
1100
1101 let x = if x_prev.is_some() && no_prev_literals > 0 {
1103 self.get_code_skip1(x_prev.unwrap(), &text_ptr[i as usize..])
1104 } else {
1105 self.get_code(&text_ptr[i as usize..])
1106 };
1107 x_prev = x;
1108
1109 if x.is_none() {
1110 let nrun_len = self.get_nrun_len(&text_ptr[i as usize..], (text_size - i) as usize);
1112
1113 if nrun_len >= MIN_NRUN_LEN {
1114 est_cost += Self::cost_nrun_v2(nrun_len);
1115 i += nrun_len;
1116 no_prev_literals = 0;
1117 } else {
1118 est_cost += 1;
1120 i += 1;
1121 pred_pos += 1;
1122 no_prev_literals += 1;
1123 }
1124 continue;
1125 }
1126
1127 let kmer_code = x.unwrap();
1129 let hash = MurMur64Hash::hash(kmer_code);
1130 let max_len = (text_size - i) as usize;
1131
1132 if let Some((match_pos, len_bck, len_fwd)) = self.find_best_match_lp(
1133 kmer_code,
1134 hash,
1135 target,
1136 i as usize,
1137 max_len,
1138 no_prev_literals as usize,
1139 ) {
1140 let total_len = len_bck + len_fwd;
1141 let is_end_match = (i + total_len) == text_size
1147 && (match_pos + total_len) as usize == self.reference_len;
1148
1149 if is_end_match {
1150 est_cost += self.cost_match_v2(match_pos, u32::MAX, pred_pos);
1151 } else {
1152 est_cost += self.cost_match_v2(match_pos, total_len, pred_pos);
1153 }
1154
1155 pred_pos = match_pos + total_len;
1157 i += total_len;
1158 no_prev_literals = 0;
1159 } else {
1160 est_cost += 1;
1162 i += 1;
1163 pred_pos += 1;
1164 no_prev_literals += 1;
1165 }
1166 }
1167
1168 est_cost += text_size - i;
1170
1171 est_cost
1172 }
1173}
1174
1175#[cfg(test)]
1176mod tests {
1177 use super::*;
1178
1179 #[test]
1180 fn test_simple_literal() {
1181 let reference = vec![0, 0, 0, 1, 1, 1];
1182 let target = vec![0, 1, 2, 3];
1183
1184 let mut lz = LZDiff::new(18);
1185 lz.prepare(&reference);
1186
1187 let encoded = lz.encode(&target);
1188 let decoded = lz.decode(&encoded);
1189
1190 assert_eq!(target, decoded);
1191 }
1192
1193 #[test]
1194 fn test_identical_sequences() {
1195 let reference = vec![0, 1, 2, 3, 0, 1, 2, 3];
1196 let target = reference.clone();
1197
1198 let mut lz = LZDiff::new(18);
1199 lz.prepare(&reference);
1200
1201 let encoded = lz.encode(&target);
1202 assert_eq!(encoded.len(), 0);
1204
1205 let decoded = if encoded.is_empty() && target.len() == reference.len() {
1207 reference.clone()
1208 } else {
1209 lz.decode(&encoded)
1210 };
1211
1212 assert_eq!(target, decoded);
1213 }
1214}