// runar_compiler_rust/codegen/blake3.rs

//! BLAKE3 compression codegen for Bitcoin Script.
//!
//! Port of packages/runar-compiler/src/passes/blake3-codegen.ts.
//!
//! emit_blake3_compress: [chainingValue(32 BE), block(64 BE)] -> [hash(32 BE)]
//! emit_blake3_hash:     [message(<=64 BE)]                   -> [hash(32 BE)]
//!
//! Architecture (same as sha256.rs):
//!   - All 32-bit words stored as 4-byte little-endian during computation.
//!   - LE additions via BIN2NUM/NUM2BIN (13 ops per add32).
//!   - Byte-aligned rotations (16, 8) via SPLIT/SWAP/CAT on LE (4 ops).
//!   - Non-byte-aligned rotations (12, 7) via LE->BE->rotrBE->BE->LE (31 ops).
//!   - BE<->LE conversion only at input unpack and output pack.
//!
//! Stack layout during rounds:
//!   [m0..m15, v0..v15]  (all LE 4-byte values)
//!   v15 at TOS (depth 0), v0 at depth 15, m15 at depth 16, m0 at depth 31.
use std::sync::OnceLock;

use super::stack::{PushValue, StackOp};
// =========================================================================
// BLAKE3 constants
// =========================================================================

/// BLAKE3 initialization vector.
const BLAKE3_IV: [u32; 8] = [
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
];

/// Per-round message word permutation applied between rounds.
const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];

// Flags for a single-chunk, root-output compression.
const CHUNK_START: u32 = 1;
const CHUNK_END: u32 = 2;
const ROOT: u32 = 8;

// =========================================================================
// Helper: encode u32 as 4-byte little-endian
// =========================================================================

/// Encode `n` as 4 little-endian bytes.
fn u32_to_le(n: u32) -> Vec<u8> {
    n.to_le_bytes().to_vec()
}

/// Encode `n` as 4 big-endian bytes.
fn u32_to_be(n: u32) -> Vec<u8> {
    n.to_be_bytes().to_vec()
}

// =========================================================================
// Precompute message schedule for all 7 rounds
// =========================================================================

/// For each round, compute which original message word index is used at each
/// position. Returns `msg_schedule[round][position]` = original msg word index.
///
/// Round 0 is the identity order; each subsequent round applies
/// `MSG_PERMUTATION` to the previous round's word order.
fn compute_msg_schedule() -> [[usize; 16]; 7] {
    let mut current: [usize; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    let mut schedule = [[0usize; 16]; 7];
    for round_schedule in schedule.iter_mut() {
        *round_schedule = current;
        let mut next = [0usize; 16];
        for i in 0..16 {
            next[i] = current[MSG_PERMUTATION[i]];
        }
        current = next;
    }
    schedule
}
80
81// =========================================================================
82// Emitter with depth tracking
83// =========================================================================
84
85struct Emitter {
86    ops: Vec<StackOp>,
87    depth: i64,
88    alt_depth: i64,
89}
90
91impl Emitter {
92    fn new(initial_depth: i64) -> Self {
93        Emitter {
94            ops: Vec::new(),
95            depth: initial_depth,
96            alt_depth: 0,
97        }
98    }
99
100    fn e_raw(&mut self, sop: StackOp) {
101        self.ops.push(sop);
102    }
103
104    fn oc(&mut self, code: &str) {
105        self.ops.push(StackOp::Opcode(code.to_string()));
106    }
107
108    fn push_i(&mut self, v: i128) {
109        self.ops.push(StackOp::Push(PushValue::Int(v)));
110        self.depth += 1;
111    }
112
113    fn push_b(&mut self, v: Vec<u8>) {
114        self.ops.push(StackOp::Push(PushValue::Bytes(v)));
115        self.depth += 1;
116    }
117
118    fn dup(&mut self) {
119        self.ops.push(StackOp::Dup);
120        self.depth += 1;
121    }
122
123    fn drop(&mut self) {
124        self.ops.push(StackOp::Drop);
125        self.depth -= 1;
126    }
127
128    fn swap(&mut self) {
129        self.ops.push(StackOp::Swap);
130    }
131
132    fn over(&mut self) {
133        self.ops.push(StackOp::Over);
134        self.depth += 1;
135    }
136
137    fn rot(&mut self) {
138        self.ops.push(StackOp::Rot);
139    }
140
141    fn pick(&mut self, d: usize) {
142        if d == 0 {
143            self.dup();
144            return;
145        }
146        if d == 1 {
147            self.over();
148            return;
149        }
150        self.push_i(d as i128);
151        self.ops.push(StackOp::Pick { depth: d });
152        // push_i added 1, pick removes the depth literal but adds the picked value = net 0
153    }
154
155    fn roll(&mut self, d: usize) {
156        if d == 0 {
157            return;
158        }
159        if d == 1 {
160            self.swap();
161            return;
162        }
163        if d == 2 {
164            self.rot();
165            return;
166        }
167        self.push_i(d as i128);
168        self.ops.push(StackOp::Roll { depth: d });
169        self.depth -= 1; // push_i added 1, roll removes depth literal and item = net -1
170    }
171
172    fn to_alt(&mut self) {
173        self.oc("OP_TOALTSTACK");
174        self.depth -= 1;
175        self.alt_depth += 1;
176    }
177
178    fn from_alt(&mut self) {
179        self.oc("OP_FROMALTSTACK");
180        self.depth += 1;
181        self.alt_depth -= 1;
182    }
183
184    fn bin_op(&mut self, code: &str) {
185        self.oc(code);
186        self.depth -= 1;
187    }
188
189    fn uni_op(&mut self, code: &str) {
190        self.oc(code);
191    }
192
193    fn split(&mut self) {
194        self.oc("OP_SPLIT");
195        // splits: consumes 2 (value + position), produces 2 = net 0
196    }
197
198    fn split4(&mut self) {
199        self.push_i(4);
200        self.split();
201    }
202
203    fn assert_depth(&self, expected: i64, msg: &str) {
204        assert_eq!(
205            self.depth, expected,
206            "BLAKE3 codegen: {}. Expected depth {}, got {}",
207            msg, expected, self.depth
208        );
209    }
210
211    // --- Byte reversal (only for BE<->LE conversion at boundaries) ---
212
213    /// Reverse 4 bytes on TOS: [abcd] -> [dcba]. Net: 0. 12 ops.
214    fn reverse_bytes4(&mut self) {
215        self.push_i(1);
216        self.split();
217        self.push_i(1);
218        self.split();
219        self.push_i(1);
220        self.split();
221        self.swap();
222        self.bin_op("OP_CAT");
223        self.swap();
224        self.bin_op("OP_CAT");
225        self.swap();
226        self.bin_op("OP_CAT");
227    }
228
229    // --- LE <-> Numeric conversions ---
230
231    /// Convert 4-byte LE to unsigned script number. [le4] -> [num]. Net: 0. 3 ops.
232    fn le2num(&mut self) {
233        self.push_b(vec![0x00]); // unsigned padding
234        self.bin_op("OP_CAT");
235        self.uni_op("OP_BIN2NUM");
236    }
237
238    /// Convert script number to 4-byte LE (truncates to 32 bits). [num] -> [le4]. Net: 0. 5 ops.
239    fn num2le(&mut self) {
240        self.push_i(5);
241        self.bin_op("OP_NUM2BIN"); // 5-byte LE
242        self.push_i(4);
243        self.split(); // [4-byte LE, overflow+sign]
244        self.drop(); // discard overflow byte
245    }
246
247    // --- LE arithmetic ---
248
249    /// [a(LE), b(LE)] -> [(a+b mod 2^32)(LE)]. Net: -1. 13 ops.
250    fn add32(&mut self) {
251        self.le2num();
252        self.swap();
253        self.le2num();
254        self.bin_op("OP_ADD");
255        self.num2le();
256    }
257
258    /// Add N LE values. [v0..vN-1] (vN-1=TOS) -> [sum(LE)]. Net: -(N-1).
259    fn add_n(&mut self, n: usize) {
260        if n < 2 {
261            return;
262        }
263        self.le2num();
264        for _ in 1..n {
265            self.swap();
266            self.le2num();
267            self.bin_op("OP_ADD");
268        }
269        self.num2le();
270    }
271
272    // --- ROTR using OP_LSHIFT/OP_RSHIFT (native BE byte-array shifts) ---
273
274    /// ROTR(x, n) on BE 4-byte value. [x_BE] -> [rotated_BE]. Net: 0. 7 ops.
275    fn rotr_be(&mut self, n: usize) {
276        self.dup(); // [x, x]
277        self.push_i(n as i128);
278        self.bin_op("OP_RSHIFT"); // [x, x>>n]
279        self.swap(); // [x>>n, x]
280        self.push_i((32 - n) as i128);
281        self.bin_op("OP_LSHIFT"); // [x>>n, x<<(32-n)]
282        self.bin_op("OP_OR"); // [ROTR result]
283    }
284
285    // --- ROTR on LE values ---
286
287    /// ROTR(x, 16) on LE 4-byte value. Net: 0. 4 ops.
288    /// Swaps the two 16-bit halves: [b0,b1,b2,b3] -> [b2,b3,b0,b1].
289    fn rotr16_le(&mut self) {
290        self.push_i(2);
291        self.split(); // [lo2, hi2]
292        self.swap(); // [hi2, lo2]
293        self.bin_op("OP_CAT"); // [hi2||lo2]
294    }
295
296    /// ROTR(x, 8) on LE 4-byte value. Net: 0. 4 ops.
297    /// [b0,b1,b2,b3] -> [b1,b2,b3,b0]
298    fn rotr8_le(&mut self) {
299        self.push_i(1);
300        self.split(); // [b0, b1b2b3]
301        self.swap(); // [b1b2b3, b0]
302        self.bin_op("OP_CAT"); // [b1b2b3||b0]
303    }
304
305    /// ROTR(x, n) on LE 4-byte value (general, non-byte-aligned). Net: 0. 31 ops.
306    /// Converts LE->BE, applies rotr_be, converts back.
307    fn rotr_le_general(&mut self, n: usize) {
308        self.reverse_bytes4(); // LE -> BE (12 ops)
309        self.rotr_be(n); // rotate on BE (7 ops)
310        self.reverse_bytes4(); // BE -> LE (12 ops)
311    }
312
313    /// Convert N x BE words on TOS to LE, preserving stack order.
314    fn be_words_to_le(&mut self, n: usize) {
315        for _ in 0..n {
316            self.reverse_bytes4();
317            self.to_alt();
318        }
319        for _ in 0..n {
320            self.from_alt();
321        }
322    }
323}
324
// =========================================================================
// State word position tracker
// =========================================================================

/// Tracks the stack depth of each of the 16 state words.
/// Depth 0 = TOS. Message words sit below the state area at fixed positions.
struct StateTracker {
    /// positions[i] = current depth of state word v[i] from TOS.
    /// A negative value marks a word that has been moved off the main stack.
    positions: [i32; 16],
}

impl StateTracker {
    /// Initial layout: v0 at depth 15 (deepest state word), v15 at depth 0 (TOS).
    fn new() -> Self {
        let mut positions = [0i32; 16];
        for (i, pos) in positions.iter_mut().enumerate() {
            *pos = (15 - i) as i32;
        }
        StateTracker { positions }
    }

    /// Current depth of state word `word_idx` from TOS.
    fn depth(&self, word_idx: usize) -> i32 {
        self.positions[word_idx]
    }

    /// Update after rolling a state word from its current depth to TOS:
    /// every tracked word that was shallower than it sinks by one.
    fn on_roll_to_top(&mut self, word_idx: usize) {
        let d = self.positions[word_idx];
        for j in 0..16 {
            if j != word_idx && self.positions[j] >= 0 && self.positions[j] < d {
                self.positions[j] += 1;
            }
        }
        self.positions[word_idx] = 0;
    }
}
361
362// =========================================================================
363// G function (quarter-round)
364// =========================================================================
365
366/// Emit one half of the G function.
367/// Stack entry: [a, b, c, d, m] (m on TOS) -- 5 items
368/// Stack exit:  [a', b', c', d'] (d' on TOS) -- 4 items
369/// Net depth: -1
370///
371/// Operations:
372///   a' = a + b + m
373///   d' = (d ^ a') >>> rotD
374///   c' = c + d'
375///   b' = (original_b ^ c') >>> rotB
376fn emit_half_g(em: &mut Emitter, rot_d: usize, rot_b: usize) {
377    let d0 = em.depth;
378
379    // Save original b for step 4 (b is at depth 3)
380    em.pick(3);
381    em.to_alt();
382
383    // Step 1: a' = a + b + m
384    // Stack: [a, b, c, d, m] -- a=4, b=3, c=2, d=1, m=0
385    em.roll(3); // [a, c, d, m, b]
386    em.roll(4); // [c, d, m, b, a]
387    em.add_n(3); // [c, d, a']
388    em.assert_depth(d0 - 2, "halfG step1");
389
390    // Step 2: d' = (d ^ a') >>> rotD
391    // Stack: [c, d, a'] -- c=2, d=1, a'=0
392    em.dup(); // [c, d, a', a']
393    em.rot(); // [c, a', a', d]
394    em.bin_op("OP_XOR"); // [c, a', (d^a')]
395    if rot_d == 16 {
396        em.rotr16_le();
397    } else if rot_d == 8 {
398        em.rotr8_le();
399    } else {
400        em.rotr_le_general(rot_d);
401    }
402    em.assert_depth(d0 - 2, "halfG step2");
403
404    // Step 3: c' = c + d'
405    // Stack: [c, a', d']
406    em.dup(); // [c, a', d', d']
407    em.roll(3); // [a', d', d', c]
408    em.add32(); // [a', d', c']
409    em.assert_depth(d0 - 2, "halfG step3");
410
411    // Step 4: b' = (original_b ^ c') >>> rotB
412    // Stack: [a', d', c']
413    em.from_alt(); // [a', d', c', b]
414    em.over(); // [a', d', c', b, c']
415    em.bin_op("OP_XOR"); // [a', d', c', (b^c')]
416    em.rotr_le_general(rot_b);
417    // Stack: [a', d', c', b']
418    em.assert_depth(d0 - 1, "halfG step4");
419
420    // Rearrange: [a', d', c', b'] -> [a', b', c', d']
421    em.swap(); // [a', d', b', c']
422    em.rot(); // [a', b', c', d']
423    em.assert_depth(d0 - 1, "halfG done");
424}
425
426/// Emit the full G function (quarter-round).
427/// Stack entry: [a, b, c, d, mx, my] (my on TOS) -- 6 items
428/// Stack exit:  [a', b', c', d'] (d' on TOS) -- 4 items
429/// Net depth: -2
430fn emit_g(em: &mut Emitter) {
431    let d0 = em.depth;
432
433    // Save my to alt for phase 2
434    em.to_alt(); // [a, b, c, d, mx]
435
436    // Phase 1: first half with mx, ROTR(16) and ROTR(12)
437    emit_half_g(em, 16, 12);
438    em.assert_depth(d0 - 2, "G phase1");
439
440    // Restore my for phase 2
441    em.from_alt(); // [a', b', c', d', my]
442    em.assert_depth(d0 - 1, "G before phase2");
443
444    // Phase 2: second half with my, ROTR(8) and ROTR(7)
445    emit_half_g(em, 8, 7);
446    em.assert_depth(d0 - 2, "G done");
447}
448
449// =========================================================================
450// G call with state management
451// =========================================================================
452
453/// Emit a single G call with state word roll management.
454///
455/// Rolls 4 state words (ai, bi, ci, di) to top, picks 2 message words,
456/// runs G, then updates tracker.
457fn emit_g_call(
458    em: &mut Emitter,
459    tracker: &mut StateTracker,
460    ai: usize,
461    bi: usize,
462    ci: usize,
463    di: usize,
464    mx_orig_idx: usize,
465    my_orig_idx: usize,
466) {
467    let d0 = em.depth;
468
469    // Roll 4 state words to top: a, b, c, d (d ends up as TOS)
470    for &idx in &[ai, bi, ci, di] {
471        let d = tracker.depth(idx) as usize;
472        em.roll(d);
473        tracker.on_roll_to_top(idx);
474    }
475
476    // Pick message words from below the 16 state word area
477    // m[i] is at depth: 16 (state words) + (15 - i)
478    em.pick(16 + (15 - mx_orig_idx));
479    em.pick(16 + (15 - my_orig_idx) + 1); // +1 for mx just pushed
480    em.assert_depth(d0 + 2, "before G");
481
482    // Run G: consumes 6 (a, b, c, d, mx, my), produces 4 (a', b', c', d')
483    emit_g(em);
484    em.assert_depth(d0, "after G");
485
486    // Update tracker: result words at depths 0-3
487    tracker.positions[ai] = 3;
488    tracker.positions[bi] = 2;
489    tracker.positions[ci] = 1;
490    tracker.positions[di] = 0;
491}
492
493// =========================================================================
494// Full compression ops generator
495// =========================================================================
496
497fn generate_compress_ops() -> Vec<StackOp> {
498    let mut em = Emitter::new(2);
499    let msg_schedule = compute_msg_schedule();
500
501    // ================================================================
502    // Phase 1: Unpack block into 16 LE message words
503    // ================================================================
504    // Stack: [chainingValue(32 BE), block(64 BE)]
505    // Split block into 16 x 4-byte BE words, convert to LE
506    for _ in 0..15 {
507        em.split4();
508    }
509    em.assert_depth(17, "after block unpack"); // 16 block words + 1 chainingValue
510    em.be_words_to_le(16);
511    em.assert_depth(17, "after block LE convert");
512    // Stack: [CV, m0(LE), m1(LE), ..., m15(LE)] -- m0 deepest of msg words, m15 TOS
513
514    // ================================================================
515    // Phase 2: Initialize 16-word state on top of message words
516    // ================================================================
517    // Move CV to alt (it's below the 16 msg words, at depth 16)
518    em.roll(16);
519    em.to_alt();
520    em.assert_depth(16, "after CV to alt");
521    // Stack: [m0, m1, ..., m15]  Alt: [CV]
522
523    // Get CV back, split into 8 LE words, place on top of msg
524    em.from_alt();
525    em.assert_depth(17, "after CV from alt");
526    for _ in 0..7 {
527        em.split4();
528    }
529    em.assert_depth(24, "after cv unpack");
530    em.be_words_to_le(8);
531    em.assert_depth(24, "after cv LE convert");
532    // Stack: [m0..m15, cv0(LE)..cv7(LE)]
533
534    // v[0..7] = chaining value (already on stack)
535    // v[8..11] = IV[0..3]
536    for i in 0..4 {
537        em.push_b(u32_to_le(BLAKE3_IV[i]));
538    }
539    em.assert_depth(28, "after IV push");
540
541    // v[12] = counter_low = 0, v[13] = counter_high = 0
542    em.push_b(u32_to_le(0));
543    em.push_b(u32_to_le(0));
544    // v[14] = block_len = 64
545    em.push_b(u32_to_le(64));
546    // v[15] = flags = CHUNK_START | CHUNK_END | ROOT = 11
547    em.push_b(u32_to_le(CHUNK_START | CHUNK_END | ROOT));
548    em.assert_depth(32, "after state init");
549
550    // Stack: [m0..m15(bottom), v0..v15(top)] -- v15=TOS, m0=deepest
551
552    // ================================================================
553    // Phase 3: 7 rounds of G function calls
554    // ================================================================
555    let mut tracker = StateTracker::new();
556
557    for round in 0..7 {
558        let s = &msg_schedule[round];
559
560        // Column mixing
561        emit_g_call(&mut em, &mut tracker, 0, 4, 8, 12, s[0], s[1]);
562        emit_g_call(&mut em, &mut tracker, 1, 5, 9, 13, s[2], s[3]);
563        emit_g_call(&mut em, &mut tracker, 2, 6, 10, 14, s[4], s[5]);
564        emit_g_call(&mut em, &mut tracker, 3, 7, 11, 15, s[6], s[7]);
565
566        // Diagonal mixing
567        emit_g_call(&mut em, &mut tracker, 0, 5, 10, 15, s[8], s[9]);
568        emit_g_call(&mut em, &mut tracker, 1, 6, 11, 12, s[10], s[11]);
569        emit_g_call(&mut em, &mut tracker, 2, 7, 8, 13, s[12], s[13]);
570        emit_g_call(&mut em, &mut tracker, 3, 4, 9, 14, s[14], s[15]);
571    }
572
573    em.assert_depth(32, "after all rounds");
574
575    // ================================================================
576    // Phase 4: Output -- hash[i] = state[i] XOR state[i+8], for i=0..7
577    // ================================================================
578
579    // Canonical reorder via alt stack
580    for i in (0..=15usize).rev() {
581        let d = tracker.depth(i);
582        em.roll(d as usize);
583        tracker.on_roll_to_top(i);
584        em.to_alt();
585        for j in 0..16 {
586            if j != i && tracker.positions[j] >= 0 {
587                tracker.positions[j] -= 1;
588            }
589        }
590        tracker.positions[i] = -1;
591    }
592
593    // Pop to get canonical order: [v0(bottom)..v15(TOS)]
594    for _ in 0..16 {
595        em.from_alt();
596    }
597    em.assert_depth(32, "after canonical reorder");
598
599    // State: [m0..m15, v0(bottom)..v15(TOS)], canonical order.
600    // XOR pairs: h[7-k] = v[7-k] ^ v[15-k] for k=0..7
601    // Process top-down: v15^v7, v14^v6, ..., v8^v0. Send each result to alt.
602    for k in 0..8usize {
603        em.roll(8 - k); // bring v[7-k] to TOS (past v[15-k] and remaining)
604        em.bin_op("OP_XOR"); // h[7-k] = v[7-k] ^ v[15-k]
605        em.to_alt(); // result to alt; main shrinks by 2
606    }
607    em.assert_depth(16, "after XOR pairs");
608    // Alt (bottom->top): h7, h6, h5, h4, h3, h2, h1, h0. Main: [m0..m15].
609
610    // Pop results to main: h0 first (LIFO), then h1, ..., h7
611    for _ in 0..8 {
612        em.from_alt();
613    }
614    em.assert_depth(24, "after XOR results restored");
615    // Main: [m0..m15, h0, h1, ..., h7] h7=TOS
616
617    // Pack into 32-byte BE result: h0_BE || h1_BE || ... || h7_BE
618    em.reverse_bytes4(); // h7 -> h7_BE
619    for _ in 1..8 {
620        em.swap(); // bring h[7-i] (LE) to TOS
621        em.reverse_bytes4(); // -> BE
622        em.swap(); // [new_BE, accumulated]
623        em.bin_op("OP_CAT"); // new_BE || accumulated
624    }
625    em.assert_depth(17, "after hash pack");
626
627    // Drop 16 message words
628    for _ in 0..16 {
629        em.swap();
630        em.drop();
631    }
632    em.assert_depth(1, "compress final");
633
634    em.ops
635}
636
637// Cache the ops since they're identical every time
638static COMPRESS_OPS: OnceLock<Vec<StackOp>> = OnceLock::new();
639
640fn get_compress_ops() -> &'static Vec<StackOp> {
641    COMPRESS_OPS.get_or_init(generate_compress_ops)
642}
643
644// =========================================================================
645// Public entry points
646// =========================================================================
647
648/// Emit BLAKE3 single-block compression in Bitcoin Script.
649/// Stack on entry: [..., chainingValue(32 BE), block(64 BE)]
650/// Stack on exit:  [..., hash(32 BE)]
651/// Net depth: -1
652pub fn emit_blake3_compress(emit: &mut dyn FnMut(StackOp)) {
653    for op in get_compress_ops() {
654        emit(op.clone());
655    }
656}
657
658/// Emit BLAKE3 hash for a message up to 64 bytes.
659/// Stack on entry: [..., message(<=64 BE)]
660/// Stack on exit:  [..., hash(32 BE)]
661/// Net depth: 0
662///
663/// Applies zero-padding and uses IV as chaining value.
664pub fn emit_blake3_hash(emit: &mut dyn FnMut(StackOp)) {
665    let mut em = Emitter::new(1);
666
667    // Pad message to 64 bytes (BLAKE3 zero-pads, no length suffix)
668    em.oc("OP_SIZE");
669    em.depth += 1; // [message, len]
670    em.push_i(64);
671    em.swap();
672    em.bin_op("OP_SUB"); // [message, 64-len]
673    em.push_i(0);
674    em.swap();
675    em.bin_op("OP_NUM2BIN"); // [message, zeros]
676    em.bin_op("OP_CAT"); // [paddedMessage(64)]
677
678    // Push IV as 32-byte BE chaining value
679    let mut iv_bytes = Vec::with_capacity(32);
680    for i in 0..8 {
681        iv_bytes.extend_from_slice(&u32_to_be(BLAKE3_IV[i]));
682    }
683    em.push_b(iv_bytes);
684    em.swap(); // [IV(32 BE), paddedMessage(64 BE)]
685
686    // Splice compression ops
687    let compress_ops = get_compress_ops();
688    for op in compress_ops {
689        em.e_raw(op.clone());
690    }
691    em.depth = 1;
692
693    em.assert_depth(1, "blake3Hash final");
694
695    for op in em.ops {
696        emit(op);
697    }
698}