datacortex_core/model/
engine.rs

1//! CMEngine -- orchestrates all context models + mixer + APM.
2//!
3//! Phase 5+: Multi-output ContextMap engine.
4//!
5//! Each order model (O1-O9) now produces 3 predictions:
6//! - StateMap prediction (primary)
7//! - Run-count prediction (consecutive same-bit tracking)
8//! - Byte-history prediction (last byte seen in this context)
9//!
10//! Total mixer inputs: 1 (O0) + 9*3 (O1-O9) + 9 (other) = 37.
11//!
12//! Both presets use:
13//! - Triple logistic mixer (fine 64K + med 16K + coarse 4K)
14//! - 7-stage APM cascade
15//!
16//! Probability always in [1, 4095] -- clamped after every operation.
17//! CRITICAL: Encoder/decoder must use IDENTICAL prediction sequence.
18
19use crate::mixer::apm::APMStage;
20use crate::mixer::dual_mixer::{NUM_MODELS, byte_class};
21use crate::mixer::isse::IsseChain;
22use crate::mixer::multi_set_mixer::MultiSetMixer;
23use crate::model::cm_model::{AssociativeContextModel, ChecksumContextModel, ContextModel};
24use crate::model::dmc_model::DmcModel;
25use crate::model::indirect_model::IndirectModel;
26use crate::model::json_model::JsonModel;
27use crate::model::match_model::MatchModel;
28use crate::model::order0::Order0Model;
29use crate::model::ppm_model::{PpmConfig, PpmModel};
30use crate::model::run_model::RunModel;
31use crate::model::sparse_model::SparseModel;
32use crate::model::word_model::WordModel;
33
34/// Configuration for the CM engine -- controls memory/quality trade-off.
35///
36/// Each field specifies the ContextMap size in bytes for the corresponding model.
37/// Larger sizes reduce hash collisions and improve prediction accuracy at the
38/// cost of more memory and slightly slower cache performance.
39#[derive(Debug, Clone)]
40pub struct CMConfig {
41    /// Order-1 ContextMap size (default: 32MB).
42    pub order1_size: usize,
43    /// Order-2 ContextMap size (default: 16MB).
44    pub order2_size: usize,
45    /// Order-3 ChecksumContextMap size (default: 32MB).
46    pub order3_size: usize,
47    /// Order-4 ChecksumContextMap size (default: 32MB).
48    pub order4_size: usize,
49    /// Order-5 AssociativeContextMap size (default: 32MB).
50    pub order5_size: usize,
51    /// Order-6 AssociativeContextMap size (default: 16MB).
52    pub order6_size: usize,
53    /// Order-7 AssociativeContextMap size (default: 32MB).
54    pub order7_size: usize,
55    /// Order-8 AssociativeContextMap size (default: 32MB).
56    pub order8_size: usize,
57    /// Order-9 AssociativeContextMap size (default: 16MB).
58    pub order9_size: usize,
59    /// Match model ring buffer size in bytes (default: 16MB). Must be power of 2.
60    pub match_ring_size: usize,
61    /// Match model hash table entry count (default: 8M). Must be power of 2.
62    pub match_hash_size: usize,
63    /// Word model ContextMap size (default: 16MB).
64    pub word_size: usize,
65    /// Sparse model ContextMap size per gap context (default: 8MB, 16MB total).
66    pub sparse_size: usize,
67    /// Run model ContextMap size (default: 4MB).
68    pub run_size: usize,
69    /// JSON model ContextMap size (default: 8MB).
70    pub json_size: usize,
71    /// PPM model table sizes configuration.
72    pub ppm_config: PpmConfig,
73}
74
75impl CMConfig {
76    /// Balanced preset: production sizes (~256MB CM + ~360MB PPM).
77    pub fn balanced() -> Self {
78        CMConfig {
79            order1_size: 1 << 25,      // 32MB
80            order2_size: 1 << 24,      // 16MB
81            order3_size: 1 << 25,      // 32MB
82            order4_size: 1 << 25,      // 32MB
83            order5_size: 1 << 25,      // 32MB
84            order6_size: 1 << 24,      // 16MB
85            order7_size: 1 << 25,      // 32MB
86            order8_size: 1 << 25,      // 32MB
87            order9_size: 1 << 24,      // 16MB
88            match_ring_size: 16 << 20, // 16MB
89            match_hash_size: 8 << 20,  // 8M entries
90            word_size: 1 << 24,        // 16MB
91            sparse_size: 1 << 23,      // 8MB per gap (16MB total)
92            run_size: 1 << 22,         // 4MB
93            json_size: 1 << 23,        // 8MB
94            ppm_config: PpmConfig::scaled_4x(),
95        }
96    }
97
98    /// Max preset: 2x everything for better compression (~512MB CM + ~360MB PPM).
99    /// Doubles all ContextMap sizes, match ring, and hash table.
100    /// More context slots = fewer collisions = better predictions.
101    pub fn max() -> Self {
102        CMConfig {
103            order1_size: 1 << 26,      // 64MB
104            order2_size: 1 << 25,      // 32MB
105            order3_size: 1 << 26,      // 64MB
106            order4_size: 1 << 26,      // 64MB
107            order5_size: 1 << 26,      // 64MB
108            order6_size: 1 << 25,      // 32MB
109            order7_size: 1 << 26,      // 64MB
110            order8_size: 1 << 26,      // 64MB
111            order9_size: 1 << 25,      // 32MB
112            match_ring_size: 32 << 20, // 32MB
113            match_hash_size: 16 << 20, // 16M entries
114            word_size: 1 << 25,        // 32MB
115            sparse_size: 1 << 24,      // 16MB per gap (32MB total)
116            run_size: 1 << 23,         // 8MB
117            json_size: 1 << 24,        // 16MB
118            ppm_config: PpmConfig::scaled_4x(),
119        }
120    }
121}
122
123/// Context mixing engine -- orchestrates all models, mixer, and APM.
124pub struct CMEngine {
125    // --- Models ---
126    /// Order-0: 256-context partial byte predictor.
127    order0: Order0Model,
128    /// Order-1: previous byte + partial byte context. ContextMap 32MB.
129    order1: ContextModel,
130    /// Order-2: previous 2 bytes + partial byte context. ContextMap 16MB.
131    order2: ContextModel,
132    /// Order-3: previous 3 bytes + partial byte context. ChecksumContextMap 32MB.
133    order3: ChecksumContextModel,
134    /// Order-4: previous 4 bytes + partial byte context. ChecksumContextMap 32MB.
135    order4: ChecksumContextModel,
136    /// Order-5: previous 5 bytes + partial byte context. AssociativeContextMap 32MB.
137    order5: AssociativeContextModel,
138    /// Order-6: previous 6 bytes + partial byte context. AssociativeContextMap 16MB.
139    order6: AssociativeContextModel,
140    /// Order-7: previous 7 bytes + partial byte context. AssociativeContextMap 32MB.
141    order7: AssociativeContextModel,
142    /// Order-8: previous 8 bytes + partial byte context. AssociativeContextMap 32MB.
143    order8: AssociativeContextModel,
144    /// Order-9: previous 9 bytes + partial byte context. AssociativeContextMap 16MB.
145    order9: AssociativeContextModel,
146    /// Match model: ring buffer (16MB) + hash table (8M entries).
147    match_model: MatchModel,
148    /// Word model: word boundary context. ContextMap 16MB.
149    word_model: WordModel,
150    /// Sparse model: skip-byte context for periodic patterns. 8MB total.
151    sparse_model: SparseModel,
152    /// Run model: run-length context. 2MB.
153    run_model: RunModel,
154    /// JSON model: structure-aware context. 4MB.
155    json_model: JsonModel,
156    /// Indirect model: second-order context prediction. ~2MB.
157    indirect_model: IndirectModel,
158    /// PPM model: byte-level prediction by partial matching (PPMd Method D).
159    /// Different paradigm from CM — trie/hash-based, byte-level, adaptive order with exclusion.
160    ppm_model: PpmModel,
161    /// DMC model: bit-level dynamic Markov compression automaton.
162    /// State-cloning captures sub-byte and cross-byte patterns.
163    dmc_model: DmcModel,
164
165    // --- Mixer + APM ---
166    /// Triple logistic mixer (fine 64K + medium 16K + coarse 4K).
167    mixer: MultiSetMixer,
168    /// APM Stage 1: 2K contexts (c0 * bpos), 50% blend.
169    apm1: APMStage,
170    /// APM Stage 2: 16K contexts (c1 * bpos * byte_class), 25% blend.
171    apm2: APMStage,
172    /// APM Stage 3: 4K contexts (match_q * c1_top4 * bpos), 20% blend.
173    apm3: APMStage,
174    /// APM Stage 4: 4K contexts (byte_class pair transition + run), 15% blend.
175    apm4: APMStage,
176    /// APM Stage 5: 2K contexts (c3_top4 * c2_top4 * bpos), 12% blend.
177    /// Captures longer-range byte patterns (trigram character class).
178    apm5: APMStage,
179    /// APM Stage 6: 2K contexts (match_length_quantized * c1_class * bpos), 10% blend.
180    /// Match-dependent refinement.
181    apm6: APMStage,
182    /// APM Stage 7: 2K contexts (line_pos_q * bpos * c1_class), 10% blend.
183    /// Position-within-line aware refinement.
184    apm7: APMStage,
185    /// ISSE model: 3-level ICM→ISSE→ISSE chain used as model #37 in the mixer.
186    /// Provides a complementary ZPAQ-style cascaded prediction.
187    isse_model: IsseChain,
188
189    // --- Context state ---
190    /// Partial byte being decoded (1-255). Starts at 1.
191    c0: u32,
192    /// Last completed byte.
193    c1: u8,
194    /// Second-to-last completed byte.
195    c2: u8,
196    /// Third-to-last completed byte.
197    c3: u8,
198    /// Fourth-to-last completed byte.
199    c4: u8,
200    /// Fifth-to-last completed byte.
201    c5: u8,
202    /// Sixth-to-last completed byte.
203    c6: u8,
204    /// Seventh-to-last completed byte.
205    c7: u8,
206    /// Eighth-to-last completed byte.
207    c8: u8,
208    /// Ninth-to-last completed byte.
209    c9: u8,
210    /// Bit position within current byte (0-7).
211    bpos: u8,
212    /// Byte-level run length (consecutive identical bytes).
213    run_len: u8,
214    /// Distance since last newline (quantized).
215    line_pos: u16,
216    /// Column index in columnar data (incremented on \x00 separator).
217    /// For raw (non-columnar) data, stays at 0. For columnar data, provides
218    /// column identity as additional context for prediction.
219    column_index: u16,
220}
221
222impl CMEngine {
223    /// Create a new CM engine with balanced (default) configuration.
224    pub fn new() -> Self {
225        Self::with_config(CMConfig::balanced())
226    }
227
228    /// Create a CM engine with a specific configuration.
229    pub fn with_config(config: CMConfig) -> Self {
230        CMEngine {
231            order0: Order0Model::new(),
232            order1: ContextModel::new(config.order1_size),
233            order2: ContextModel::new(config.order2_size),
234            order3: ChecksumContextModel::new(config.order3_size),
235            order4: ChecksumContextModel::new(config.order4_size),
236            order5: AssociativeContextModel::new(config.order5_size),
237            order6: AssociativeContextModel::new(config.order6_size),
238            order7: AssociativeContextModel::new(config.order7_size),
239            order8: AssociativeContextModel::new(config.order8_size),
240            order9: AssociativeContextModel::new(config.order9_size),
241            match_model: MatchModel::with_sizes(config.match_ring_size, config.match_hash_size),
242            word_model: WordModel::with_size(config.word_size),
243            sparse_model: SparseModel::with_size(config.sparse_size),
244            run_model: RunModel::with_size(config.run_size),
245            json_model: JsonModel::with_size(config.json_size),
246            indirect_model: IndirectModel::new(),
247            ppm_model: PpmModel::with_config(config.ppm_config),
248            dmc_model: DmcModel::new_single(),
249            mixer: MultiSetMixer::new(),
250            apm1: APMStage::new(2048, 55),  // c0(256) * bpos(8) = 2048
251            apm2: APMStage::new(16384, 30), // c1*bpos*byte_class = 256*8*8 = 16K
252            apm3: APMStage::new(4096, 25), // match_q(4) * c2_top2(4) * c1_top4(16) * bpos(8) = 2048, use 4K
253            apm4: APMStage::new(4096, 15), // bclass(8) * bc2(8) * bpos(8) * run_q(4) -> 4K
254            apm5: APMStage::new(4096, 15), // c3_top4(16) * c2_top4(16) * bpos(8) -> 2K mapped to 4K
255            apm6: APMStage::new(2048, 12), // match_q(4) * c1_class(8) * bpos(8) -> 256 mapped to 2K
256            apm7: APMStage::new(4096, 12), // line_pos_q(4) * bpos(8) * c1_class(8) -> 256 mapped to 4K
257            isse_model: IsseChain::new(),
258            c0: 1,
259            c1: 0,
260            c2: 0,
261            c3: 0,
262            c4: 0,
263            c5: 0,
264            c6: 0,
265            c7: 0,
266            c8: 0,
267            c9: 0,
268            bpos: 0,
269            run_len: 0,
270            line_pos: 0,
271            column_index: 0,
272        }
273    }
274
275    /// Predict probability of the next bit being 1.
276    /// Returns 12-bit probability in [1, 4095].
277    ///
278    /// CRITICAL: encoder and decoder must call this with identical state.
279    #[inline(always)]
280    pub fn predict(&mut self) -> u32 {
281        // --- Gather predictions from each model ---
282        let c0 = self.c0;
283        let c1 = self.c1;
284        let c2 = self.c2;
285        let c3 = self.c3;
286        let c4 = self.c4;
287        let c5 = self.c5;
288        let c6 = self.c6;
289        let c7 = self.c7;
290        let bpos = self.bpos;
291
292        // Order-0: context is the partial byte. Single prediction.
293        let p0 = self.order0.predict(c0 as usize);
294
295        // Order-1 through Order-9: each produces (state_p, run_p).
296        let h1 = order1_hash(c1, c0);
297        let (p1_s, p1_r) = self.order1.predict_multi(h1);
298
299        let h2 = order2_hash(c2, c1, c0);
300        let (p2_s, p2_r) = self.order2.predict_multi(h2);
301
302        let h3 = order3_hash(c3, c2, c1, c0);
303        let (p3_s, p3_r) = self.order3.predict_multi(h3);
304
305        let h4 = order4_hash(c4, c3, c2, c1, c0);
306        let (p4_s, p4_r) = self.order4.predict_multi(h4);
307
308        let h5 = order5_hash(c5, c4, c3, c2, c1, c0);
309        let (p5_s, p5_r) = self.order5.predict_multi(h5);
310
311        let h6 = order6_hash(c6, c5, c4, c3, c2, c1, c0);
312        let (p6_s, p6_r) = self.order6.predict_multi(h6);
313
314        let h7 = order7_hash(c7, c6, c5, c4, c3, c2, c1, c0);
315        let (p7_s, p7_r) = self.order7.predict_multi(h7);
316
317        let c8 = self.c8;
318        let h8 = order8_hash(c8, c7, c6, c5, c4, c3, c2, c1, c0);
319        let (p8_s, p8_r) = self.order8.predict_multi(h8);
320
321        let c9 = self.c9;
322        let h9 = order9_hash(c9, c8, c7, c6, c5, c4, c3, c2, c1, c0);
323        let (p9_s, p9_r) = self.order9.predict_multi(h9);
324
325        // Match model.
326        let p_match = self.match_model.predict(c0, bpos, c1, c2, c3);
327
328        // Word model.
329        let p_word = self.word_model.predict(c0, bpos, c1);
330
331        // Sparse model (skip-byte patterns).
332        let p_sparse = self.sparse_model.predict(c0, c1, c2, c3);
333
334        // Run model (run-length patterns).
335        let p_run = self.run_model.predict(c0, bpos, c1);
336
337        // JSON model (structure-aware).
338        let p_json = self.json_model.predict(c0, bpos, c1);
339
340        // Indirect model (second-order context).
341        let p_indirect = self.indirect_model.predict(c0, bpos, c1);
342
343        // PPM model (byte-level prediction by partial matching).
344        // PPM updates at byte level; here we just convert cached byte probs to bit prediction.
345        let p_ppm = self.ppm_model.predict_bit(bpos, c0);
346
347        // DMC model (bit-level dynamic Markov compression).
348        let p_dmc = self.dmc_model.predict();
349
350        // ISSE model (3-level ICM→ISSE→ISSE chain with cross-context hashes).
351        let p_isse = self.isse_model.predict(c0, c1, c2, c3, bpos);
352
353        // --- Mix (28 models) ---
354        // Layout: [O0, O1_s, O1_r, O2_s, O2_r, ..., O9_s, O9_r,
355        //          match, word, sparse, run, json, indirect, ppm, dmc, isse]
356        let predictions: [u32; NUM_MODELS] = [
357            p0, p1_s, p1_r, p2_s, p2_r, p3_s, p3_r, p4_s, p4_r, p5_s, p5_r, p6_s, p6_r, p7_s, p7_r,
358            p8_s, p8_r, p9_s, p9_r, p_match, p_word, p_sparse, p_run, p_json, p_indirect, p_ppm,
359            p_dmc, p_isse,
360        ];
361        let bclass = byte_class(c1);
362        let match_q = self.match_model.match_length_quantized();
363        let run_q = quantize_run_for_mixer(self.run_len);
364
365        let mixed = self
366            .mixer
367            .predict(&predictions, c0, c1, c2, bpos, bclass, match_q, run_q, 0);
368
369        // --- APM cascade ---
370        // Stage 1: context = c0_partial(8b) + bpos(3b) + run_q(2b) = 13 bits -> 2048 contexts (folded).
371        let apm1_ctx = (((c0 as usize & 0xFF) << 3) | bpos as usize)
372            .wrapping_mul(5)
373            .wrapping_add(run_q as usize & 0x3)
374            & 2047;
375        let after_apm1 = self.apm1.predict(mixed, apm1_ctx);
376
377        // Stage 2: context = c1(8b) * bpos(3b) + byte_class(3b) + c2_top4(4b).
378        let apm2_ctx = (((c1 as usize) << 3 | bpos as usize) * 8 + bclass as usize)
379            .wrapping_mul(17)
380            .wrapping_add(c2 as usize >> 4)
381            & 16383;
382        let after_apm2 = self.apm2.predict(after_apm1, apm2_ctx);
383
384        // Stage 3: context = match_q(2b) * c1_top4(4b) * bpos(3b) * c2_top2(2b) + match_len_q(2b).
385        let apm3_ctx = ((match_q as usize * 512)
386            + ((c2 as usize >> 6) << 7)
387            + ((c1 as usize >> 4) << 3)
388            + bpos as usize)
389            .wrapping_mul(5)
390            .wrapping_add(match_q as usize)
391            & 4095;
392        let after_apm3 = self.apm3.predict(after_apm2, apm3_ctx);
393
394        // Stage 4 (Neural APM): byte-class pair transition + run context.
395        // Uses byte_class(c1) x byte_class(c2) x bpos x run_q as context.
396        // This captures character-class transitions and run patterns that
397        // other APM stages miss.
398        let bc2 = byte_class(c2);
399        let apm4_ctx = (bclass as usize * 8 + bc2 as usize)
400            .wrapping_mul(33)
401            .wrapping_add(bpos as usize * 4 + run_q as usize)
402            & 4095;
403        let after_apm4 = self.apm4.predict(after_apm3, apm4_ctx);
404
405        // Stage 5: Longer-range byte pattern context.
406        // (c3_top4, c2_top4, c1_top2, bpos) — captures trigram character patterns.
407        let apm5_ctx = ((c3 as usize >> 4).wrapping_mul(67) + (c2 as usize >> 4))
408            .wrapping_mul(67)
409            .wrapping_add((c1 as usize >> 6) * 8 + bpos as usize)
410            & 4095;
411        let after_apm5 = self.apm5.predict(after_apm4, apm5_ctx);
412
413        // Stage 6: Match-dependent refinement (match_q, c1_class, bpos).
414        // When a match is active, this helps the APM adapt to match confidence.
415        let apm6_ctx = (match_q as usize * 64 + bclass as usize * 8 + bpos as usize) & 2047;
416        let after_apm6 = self.apm6.predict(after_apm5, apm6_ctx);
417
418        // Stage 7: Position/column-aware context.
419        // For columnar data: column_index provides column identity (line_pos stays 0).
420        // For raw data: line_pos provides position-within-line (column_index stays 0).
421        // Combined: exactly one is active, providing the right signal for each format.
422        let line_pos_q = quantize_line_pos(self.line_pos);
423        let pos_ctx = (line_pos_q as usize) ^ ((self.column_index as usize & 0xF) << 2);
424        let apm7_ctx = (pos_ctx.wrapping_mul(67) + (c0 as usize & 0xFF))
425            .wrapping_mul(67)
426            .wrapping_add(bpos as usize)
427            & 4095;
428        let final_p = self.apm7.predict(after_apm6, apm7_ctx);
429
430        final_p.clamp(1, 4095)
431    }
432
433    /// Update all models after observing `bit`.
434    ///
435    /// CRITICAL: encoder and decoder must call this with identical state and bit.
436    #[inline(always)]
437    pub fn update(&mut self, bit: u8) {
438        // Update APM (reverse order: stage 7 first, then 6, 5, 4, 3, 2, 1).
439        self.apm7.update(bit);
440        self.apm6.update(bit);
441        self.apm5.update(bit);
442        self.apm4.update(bit);
443        self.apm3.update(bit);
444        self.apm2.update(bit);
445        self.apm1.update(bit);
446
447        // Update mixer.
448        self.mixer.update(bit);
449
450        // Update all models.
451        self.order0.update(self.c0 as usize, bit);
452        self.order1.update(bit);
453        self.order2.update(bit);
454        self.order3.update(bit);
455        self.order4.update(bit);
456        self.order5.update(bit);
457        self.order6.update(bit);
458        self.order7.update(bit);
459        self.order8.update(bit);
460        self.order9.update(bit);
461        self.match_model
462            .update(bit, self.bpos, self.c0, self.c1, self.c2);
463        self.word_model.update(bit);
464        self.sparse_model.update(bit);
465        self.run_model.update(bit);
466        self.json_model.update(bit);
467        self.indirect_model.update(bit);
468        self.dmc_model.update(bit);
469        self.isse_model.update(bit, self.c0, self.bpos);
470
471        // Advance context state.
472        self.c0 = (self.c0 << 1) | bit as u32;
473        self.bpos += 1;
474
475        if self.bpos >= 8 {
476            // Byte complete. Extract byte value and reset.
477            let byte = (self.c0 & 0xFF) as u8;
478            // Track run length.
479            if byte == self.c1 {
480                self.run_len = self.run_len.saturating_add(1);
481            } else {
482                self.run_len = 1;
483            }
484            // Track line position.
485            if byte == b'\n' {
486                self.line_pos = 0;
487            } else {
488                self.line_pos = self.line_pos.saturating_add(1);
489            }
490            // Track column index for columnar data (\x00 = column separator).
491            if byte == 0x00 {
492                self.column_index = self.column_index.wrapping_add(1);
493            }
494
495            // Update PPM model at byte level (NOT per-bit).
496            self.ppm_model.update_byte(byte);
497
498            // Notify DMC model of byte completion (for state reset to byte context).
499            self.dmc_model.on_byte_complete(byte);
500
501            self.c9 = self.c8;
502            self.c8 = self.c7;
503            self.c7 = self.c6;
504            self.c6 = self.c5;
505            self.c5 = self.c4;
506            self.c4 = self.c3;
507            self.c3 = self.c2;
508            self.c2 = self.c1;
509            self.c1 = byte;
510            self.c0 = 1; // reset partial byte
511            self.bpos = 0;
512        }
513    }
514}
515
516impl Default for CMEngine {
517    fn default() -> Self {
518        Self::new()
519    }
520}
521
522// --- Context hash functions ---
523// Use FNV-1a style hashing with different seeds per order for speed
524// and reasonable distribution. The seed ensures different orders produce
525// different hashes even with overlapping byte sequences.
526
527/// FNV-1a offset basis.
528const FNV_OFFSET: u32 = 0x811C9DC5;
529/// FNV-1a prime.
530const FNV_PRIME: u32 = 0x01000193;
531
532/// Order-1 context hash: combines last byte with partial byte.
533#[inline]
534fn order1_hash(c1: u8, c0: u32) -> u32 {
535    let mut h = FNV_OFFSET;
536    h ^= c1 as u32;
537    h = h.wrapping_mul(FNV_PRIME);
538    h ^= c0 & 0xFF;
539    h = h.wrapping_mul(FNV_PRIME);
540    h
541}
542
543/// Order-2 context hash: combines last 2 bytes with partial byte.
544#[inline]
545fn order2_hash(c2: u8, c1: u8, c0: u32) -> u32 {
546    let mut h = FNV_OFFSET;
547    h ^= c2 as u32;
548    h = h.wrapping_mul(FNV_PRIME);
549    h ^= c1 as u32;
550    h = h.wrapping_mul(FNV_PRIME);
551    h ^= c0 & 0xFF;
552    h = h.wrapping_mul(FNV_PRIME);
553    h
554}
555
556/// Order-3 context hash: combines last 3 bytes with partial byte.
557#[inline]
558fn order3_hash(c3: u8, c2: u8, c1: u8, c0: u32) -> u32 {
559    let mut h = FNV_OFFSET;
560    h ^= c3 as u32;
561    h = h.wrapping_mul(FNV_PRIME);
562    h ^= c2 as u32;
563    h = h.wrapping_mul(FNV_PRIME);
564    h ^= c1 as u32;
565    h = h.wrapping_mul(FNV_PRIME);
566    h ^= c0 & 0xFF;
567    h = h.wrapping_mul(FNV_PRIME);
568    h
569}
570
571/// Order-4 context hash: combines last 4 bytes with partial byte.
572#[inline]
573fn order4_hash(c4: u8, c3: u8, c2: u8, c1: u8, c0: u32) -> u32 {
574    let mut h = FNV_OFFSET;
575    h ^= c4 as u32;
576    h = h.wrapping_mul(FNV_PRIME);
577    h ^= c3 as u32;
578    h = h.wrapping_mul(FNV_PRIME);
579    h ^= c2 as u32;
580    h = h.wrapping_mul(FNV_PRIME);
581    h ^= c1 as u32;
582    h = h.wrapping_mul(FNV_PRIME);
583    h ^= c0 & 0xFF;
584    h = h.wrapping_mul(FNV_PRIME);
585    h
586}
587
588/// Order-5 context hash: combines last 5 bytes with partial byte.
589#[inline]
590fn order5_hash(c5: u8, c4: u8, c3: u8, c2: u8, c1: u8, c0: u32) -> u32 {
591    let mut h = FNV_OFFSET;
592    h ^= c5 as u32;
593    h = h.wrapping_mul(FNV_PRIME);
594    h ^= c4 as u32;
595    h = h.wrapping_mul(FNV_PRIME);
596    h ^= c3 as u32;
597    h = h.wrapping_mul(FNV_PRIME);
598    h ^= c2 as u32;
599    h = h.wrapping_mul(FNV_PRIME);
600    h ^= c1 as u32;
601    h = h.wrapping_mul(FNV_PRIME);
602    h ^= c0 & 0xFF;
603    h = h.wrapping_mul(FNV_PRIME);
604    h
605}
606
607/// Order-6 context hash: combines last 6 bytes with partial byte.
608#[inline]
609fn order6_hash(c6: u8, c5: u8, c4: u8, c3: u8, c2: u8, c1: u8, c0: u32) -> u32 {
610    let mut h = FNV_OFFSET;
611    h ^= c6 as u32;
612    h = h.wrapping_mul(FNV_PRIME);
613    h ^= c5 as u32;
614    h = h.wrapping_mul(FNV_PRIME);
615    h ^= c4 as u32;
616    h = h.wrapping_mul(FNV_PRIME);
617    h ^= c3 as u32;
618    h = h.wrapping_mul(FNV_PRIME);
619    h ^= c2 as u32;
620    h = h.wrapping_mul(FNV_PRIME);
621    h ^= c1 as u32;
622    h = h.wrapping_mul(FNV_PRIME);
623    h ^= c0 & 0xFF;
624    h = h.wrapping_mul(FNV_PRIME);
625    h
626}
627
628/// Order-7 context hash: combines last 7 bytes with partial byte.
629#[inline]
630#[allow(clippy::too_many_arguments)]
631fn order7_hash(c7: u8, c6: u8, c5: u8, c4: u8, c3: u8, c2: u8, c1: u8, c0: u32) -> u32 {
632    let mut h = FNV_OFFSET;
633    h ^= c7 as u32;
634    h = h.wrapping_mul(FNV_PRIME);
635    h ^= c6 as u32;
636    h = h.wrapping_mul(FNV_PRIME);
637    h ^= c5 as u32;
638    h = h.wrapping_mul(FNV_PRIME);
639    h ^= c4 as u32;
640    h = h.wrapping_mul(FNV_PRIME);
641    h ^= c3 as u32;
642    h = h.wrapping_mul(FNV_PRIME);
643    h ^= c2 as u32;
644    h = h.wrapping_mul(FNV_PRIME);
645    h ^= c1 as u32;
646    h = h.wrapping_mul(FNV_PRIME);
647    h ^= c0 & 0xFF;
648    h = h.wrapping_mul(FNV_PRIME);
649    h
650}
651
652/// Order-8 context hash: combines last 8 bytes with partial byte.
653#[inline]
654#[allow(clippy::too_many_arguments)]
655fn order8_hash(c8: u8, c7: u8, c6: u8, c5: u8, c4: u8, c3: u8, c2: u8, c1: u8, c0: u32) -> u32 {
656    let mut h = FNV_OFFSET;
657    h ^= c8 as u32;
658    h = h.wrapping_mul(FNV_PRIME);
659    h ^= c7 as u32;
660    h = h.wrapping_mul(FNV_PRIME);
661    h ^= c6 as u32;
662    h = h.wrapping_mul(FNV_PRIME);
663    h ^= c5 as u32;
664    h = h.wrapping_mul(FNV_PRIME);
665    h ^= c4 as u32;
666    h = h.wrapping_mul(FNV_PRIME);
667    h ^= c3 as u32;
668    h = h.wrapping_mul(FNV_PRIME);
669    h ^= c2 as u32;
670    h = h.wrapping_mul(FNV_PRIME);
671    h ^= c1 as u32;
672    h = h.wrapping_mul(FNV_PRIME);
673    h ^= c0 & 0xFF;
674    h = h.wrapping_mul(FNV_PRIME);
675    h
676}
677
678/// Order-9 context hash: combines last 9 bytes with partial byte.
679#[inline]
680#[allow(clippy::too_many_arguments)]
681fn order9_hash(
682    c9: u8,
683    c8: u8,
684    c7: u8,
685    c6: u8,
686    c5: u8,
687    c4: u8,
688    c3: u8,
689    c2: u8,
690    c1: u8,
691    c0: u32,
692) -> u32 {
693    let mut h = FNV_OFFSET;
694    h ^= c9 as u32;
695    h = h.wrapping_mul(FNV_PRIME);
696    h ^= c8 as u32;
697    h = h.wrapping_mul(FNV_PRIME);
698    h ^= c7 as u32;
699    h = h.wrapping_mul(FNV_PRIME);
700    h ^= c6 as u32;
701    h = h.wrapping_mul(FNV_PRIME);
702    h ^= c5 as u32;
703    h = h.wrapping_mul(FNV_PRIME);
704    h ^= c4 as u32;
705    h = h.wrapping_mul(FNV_PRIME);
706    h ^= c3 as u32;
707    h = h.wrapping_mul(FNV_PRIME);
708    h ^= c2 as u32;
709    h = h.wrapping_mul(FNV_PRIME);
710    h ^= c1 as u32;
711    h = h.wrapping_mul(FNV_PRIME);
712    h ^= c0 & 0xFF;
713    h = h.wrapping_mul(FNV_PRIME);
714    h
715}
716
717/// Quantize run length to 0-3 for mixer context.
718#[inline]
719fn quantize_run_for_mixer(run_len: u8) -> u8 {
720    match run_len {
721        0..=1 => 0,
722        2..=3 => 1,
723        4..=8 => 2,
724        _ => 3,
725    }
726}
727
728/// Quantize line position to 0-3 for APM context.
729#[inline]
730fn quantize_line_pos(line_pos: u16) -> u8 {
731    match line_pos {
732        0..=3 => 0,   // start of line (indentation)
733        4..=15 => 1,  // early in line
734        16..=63 => 2, // mid-line
735        _ => 3,       // late in line
736    }
737}
738
739#[cfg(test)]
740mod tests {
741    use super::*;
742
743    #[test]
744    fn initial_prediction_is_balanced() {
745        let mut engine = CMEngine::new();
746        let p = engine.predict();
747        // Should be near 2048 (all models start balanced).
748        assert!(
749            (1800..=2200).contains(&p),
750            "initial prediction should be near 2048, got {p}"
751        );
752    }
753
754    #[test]
755    fn prediction_always_in_range() {
756        let mut engine = CMEngine::new();
757        let data = b"Hello, World! This is a test of the CM engine.";
758        for &byte in data {
759            for bpos in 0..8 {
760                let p = engine.predict();
761                assert!(
762                    (1..=4095).contains(&p),
763                    "prediction out of range at bpos {bpos}: {p}"
764                );
765                let bit = (byte >> (7 - bpos)) & 1;
766                engine.update(bit);
767            }
768        }
769    }
770
771    #[test]
772    fn context_state_tracks_correctly() {
773        let mut engine = CMEngine::new();
774        // Feed byte 0x42 (01000010)
775        let byte: u8 = 0x42;
776        for bpos in 0..8 {
777            let _p = engine.predict();
778            let bit = (byte >> (7 - bpos)) & 1;
779            engine.update(bit);
780        }
781        // After byte complete, c1 should be 0x42, c0 should be 1.
782        assert_eq!(engine.c1, 0x42);
783        assert_eq!(engine.c0, 1);
784        assert_eq!(engine.bpos, 0);
785    }
786
787    #[test]
788    fn repeated_byte_adapts() {
789        let mut engine = CMEngine::new();
790        let byte: u8 = b'A';
791        let mut total_bits: f64 = 0.0;
792        let mut first_byte_bits: f64 = 0.0;
793
794        for iteration in 0..50 {
795            let mut byte_bits: f64 = 0.0;
796            for bpos in 0..8 {
797                let p = engine.predict();
798                let bit = (byte >> (7 - bpos)) & 1;
799                let prob_of_bit = if bit == 1 {
800                    p as f64 / 4096.0
801                } else {
802                    1.0 - p as f64 / 4096.0
803                };
804                byte_bits += -prob_of_bit.max(0.001).log2();
805                engine.update(bit);
806            }
807            if iteration == 0 {
808                first_byte_bits = byte_bits;
809            }
810            total_bits += byte_bits;
811        }
812
813        let avg = total_bits / 50.0;
814        assert!(
815            avg < first_byte_bits,
816            "engine should improve: first={first_byte_bits:.2}, avg={avg:.2}"
817        );
818    }
819
820    #[test]
821    fn hash_functions_differ() {
822        let h1 = order1_hash(65, 1);
823        let h2 = order2_hash(0, 65, 1);
824        let h3 = order3_hash(0, 0, 65, 1);
825        // All should produce different hashes.
826        assert_ne!(h1, h2);
827        assert_ne!(h2, h3);
828    }
829
830    #[test]
831    fn engine_deterministic() {
832        // Two engines with same input must produce same predictions.
833        let data = b"determinism test";
834        let mut e1 = CMEngine::new();
835        let mut e2 = CMEngine::new();
836
837        for &byte in data {
838            for bpos in 0..8 {
839                let p1 = e1.predict();
840                let p2 = e2.predict();
841                assert_eq!(p1, p2, "engines diverged at bpos {bpos}");
842                let bit = (byte >> (7 - bpos)) & 1;
843                e1.update(bit);
844                e2.update(bit);
845            }
846        }
847    }
848}
datacortex_core/model/engine.rs

datacortex_core/model/
engine.rs