dinoxor 0.3.0

Re-implements bitwise operations as abstractions in aarch64 neon registers.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
use std::arch::asm;

/// Perform a XOR operation on two bytes using ARM NEON instructions.
///
/// Parameters:
/// - `x`: First byte for the XOR operation.
/// - `y`: Second byte for the XOR operation.
///
/// Returns:
/// The result of the XOR operation, stored in `w0`.
///
/// Safety:
/// This function is unsafe because it uses low-level inline assembly and interacts with hardware registers directly.
///
/// Notes:
/// This function leverages ARM NEON instructions for obscure XOR operations. It internally calls several helper functions such as `spread_bits_to_bytes`, `prepare_xor_truth_table`, and others.
///
/// Examples:
/// ```no_run
/// use dinoxor::dinoxor::dinoxor;
///
/// let result = dinoxor(0b11101011, 0b11111111);
/// assert_eq!(result, 0b10100 );
///
/// ```
pub fn dinoxor(x: u8, y: u8) -> u8 {
    let result: u8;

    unsafe {
        asm!(
            "mov x2, #0",                           // Initialize x2 to 0 (not used later)
            "eor v0.16b, v0.16b, v0.16b",           // Clear the contents of v0 (set all bits to 0)

            "mov w0, {x:w}",                        // Move the first operand byte into w0

            "bl {spread_bits_to_bytes}",            // Call the spread_bits_to_bytes function to load the first operand byte into the lower half of v2
            "mov w0, {y:w}",                        // Move the second operand byte into w0
            "bl {spread_bits_to_bytes}",            // Call the spread_bits_to_bytes function to load the second operand byte into the lower half of v2
                                                    // (shifting the previous value to upper)

            "bl {prepare_xor_truth_table}",         // Call the prepare_xor_truth_table function to initialize the XOR truth table in v0
            "bl {prepare_multiplication_table}",    // Call the prepare_multiplication_table function to initialize the multiplication table in v1
            "bl {calculate_xor_result}",            // Call the calculate_xor_result function to calculate and store the XOR'd byte in w0.

            x = in(reg) x,
            y = in(reg) y,
            lateout("w0") result,
            clobber_abi("C"),

            spread_bits_to_bytes = sym spread_bits_to_bytes,
            prepare_xor_truth_table = sym prepare_xor_truth_table,
            prepare_multiplication_table = sym prepare_multiplication_table,
            calculate_xor_result = sym calculate_xor_result
        );
    }

    result
}

/// Compresses the last 8 bytes of the NEON register `v2` into a single u8. Each bit of the result corresponds to one of the last 8 bytes of `v2`. A bit is set if the corresponding byte is non-zero. This function is only used in unit tests.
///
/// # Parameters:
/// - `v2`: A NEON register holding 16 bytes. The function processes the last 8 bytes of this register.
///
/// # Returns:
/// - A `u8` value where each bit represents whether the corresponding byte in `v2` is non-zero.
///
/// # Safety:
/// This function is marked as unsafe because it directly manipulates NEON registers and relies on specific hardware instructions. Use with care.
///
/// # Notes:
/// - This function is intended for use in unit tests only, as it leverages imperative operations for explanatory code.
/// - The assembly code is specific to ARM64 architecture.
///
/// # Examples:
/// ```no_run
/// use dinoxor::dinoxor::_compress_bytes_to_bits;
/// let v2 = unsafe { _compress_bytes_to_bits() };
/// assert_eq!(v2, 0); // Example: if all bytes in v2 are zero
///
/// ```
pub unsafe fn _compress_bytes_to_bits() -> u8 {
    let result: u8;
    asm!(
        // Initialize the result
        "eor w0, w0, w0",              // Clear the result register (w0 will hold the final byte)

        // Check each byte and set the corresponding bit in `w0`
        "umov w1, v2.b[8]",            // Move the byte at position 0 in v2 to w1
        "orr w0, w0, w1, lsl #0",      // Set bit 0 in w0 if w1 is non-zero
        "umov w1, v2.b[9]",            // Move the byte at position 1 in v2 to w1
        "orr w0, w0, w1, lsl #1",      // Set bit 1 in w0 if w1 is non-zero
        "umov w1, v2.b[10]",            // Repeat for each byte
        "orr w0, w0, w1, lsl #2",
        "umov w1, v2.b[11]",
        "orr w0, w0, w1, lsl #3",
        "umov w1, v2.b[12]",
        "orr w0, w0, w1, lsl #4",
        "umov w1, v2.b[13]",
        "orr w0, w0, w1, lsl #5",
        "umov w1, v2.b[14]",
        "orr w0, w0, w1, lsl #6",
        "umov w1, v2.b[15]",
        "orr w0, w0, w1, lsl #7",
        "mov {result:w}, w0",

        // Output the result
        result = out(reg) result,          // Output the result byte
    );
    result
}

/// Expands the 8 bits of the input byte in `w0` into eight 0x00/0x01 bytes in a NEON vector.
///
/// This routine reads the caller-provided input byte from the **AArch64** integer
/// register `w0` and *spreads* each of its 8 bits into a separate byte lane of a
/// vector register. The result is placed in **`v2`** (low 8 bytes), where each lane
/// is either `0x00` (bit = 0) or `0x01` (bit = 1). The bits are processed least-
/// significant first (bit 0 → byte 0, …, bit 7 → byte 7).
///
/// > **Summary:**  
/// > Input: `w0 = b7 b6 b5 b4 b3 b2 b1 b0` (bits)  
/// > Output: `v2.16b[0..7] = {b0, b1, b2, b3, b4, b5, b6, b7}` as bytes 0x00/0x01
///
/// # How it works
/// - Clears working vectors (`v1`, `v2`) and a scratch scalar (`w3`).
/// - Loops `w2 = 0..7`:
///   1. `lsr w3, w0, w2` → move bit *w2* of the input to LSB of `w3`.
///   2. `and w3, w3, #1` → isolate that bit (0 or 1).
///   3. `ext v2, v0, v0, #1` → shift the current byte vector state left by one byte.
///   4. `ins v2.b[0], w3` → insert the new 0/1 byte at lane 0.
///   5. `mov v0, v2` → commit the updated vector into `v0` for the next iteration.
/// - Final `ext v2, v0, v0, #1` aligns the last inserted byte (so the low 8 bytes of
///   `v2` hold the eight bit-bytes in order).
///
/// # Registers (inputs/outputs/clobbers)
/// - **Input:** `w0` (caller supplies the byte to spread).
/// - **Output:** **`v2.16b`** (low 8 lanes contain 0x00/0x01 for bits 0..7).
/// - **Clobbers:** `v0`, `v1`, `v2`, `w2`, `w3`, `w4`.
///
/// # Preconditions
/// - Target must be **AArch64** with NEON/AdvSIMD.
/// - The caller must place the input byte in `w0`.
/// - This routine assumes it can freely clobber `v0`,`v1`,`v2` and scratch scalars
///   `w2`,`w3`,`w4`. If you need to preserve them, save/restore at the call site.
/// - No stack or memory is touched (`options(nostack, nomem)`), but this function is
///   still `unsafe` as it relies on target-specific inline assembly and register
///   conventions.
///
/// # Semantics (pseudocode)
/// ```text
/// v := [0u8; 16]
/// for i in 0..8 {
///   bit := ((w0 >> i) & 1) as u8
///   v = shift_left_by_1_byte(v)      // drop highest byte, open slot at index 0
///   v[0] = bit                       // write 0x00 or 0x01
/// }
/// v = shift_left_by_1_byte(v)        // align
/// // Result: v2 := v ; v2[0..7] = [b0,b1,b2,b3,b4,b5,b6,b7] as bytes
/// ```
///
/// # Example
/// ```rust,ignore
/// // Requires AArch64 + NEON + your symbol being linked:
/// // Put input in w0, call the function, then read v2.
/// use std::arch::asm;
///
/// let x: u8 = 0b1011_0010; // b7..b0 = 1 0 1 1 0 0 1 0
/// let out: [u8; 16];
/// unsafe {
///     asm!(
///         "mov w0, {x}",
///         "bl {spread_bits_to_bytes}",
///         // Store v2 to memory to inspect (example using st1 intrinsic would be cleaner):
///         "st1 {{v2.16b}}, [{out}]",
///         x = in(reg) x,
///         out = in(reg) out.as_ptr(), // for illustration; in real code, use proper NEON stores/binds
///         spread_bits_to_bytes = sym spread_bits_to_bytes,
///         options(nostack)
///     );
/// }
/// // Now out[0..8] are 0x00/0x01 for bits [b0..b7].
/// ```
///
/// # Notes
/// - This routine is typically paired with a companion that **compresses** the eight
///   0/1 bytes back into a single bitfield (e.g., `_compress_bytes_to_bits`), forming
///   a round-trip suitable for property tests (QuickCheck).
/// - If you intend to chain calls or reuse `v0`, remember this function clobbers
///   `v0` while constructing the shifted stream; the final result is in `v2`.
///
/// # Safety
/// `unsafe` because:
/// - Uses inline assembly with hard-coded registers and AArch64-specific instructions.
/// - No ABI-level guarantees are made beyond the documented clobbers and the input in `w0`.
/// - The caller must ensure the target supports the required features and that clobbered
///   registers are not needed across the call.
///
/// # Performance
/// - Pure register dataflow; no memory traffic (`nostack`, `nomem`).
/// - The loop runs a fixed 8 iterations; suitable for inlining in hot paths.
pub unsafe extern "C" fn spread_bits_to_bytes() {
    asm!(
        // Clear the destination vector registers
        "eor v1.16b, v1.16b, v1.16b",
        "eor v2.16b, v2.16b, v2.16b",
        "eor w3, w3, w3", // Clear the scratch register
        "mov w2, #0",     // Initialize the counter for bit positions (0-7)
        "1:",
        "lsr w3, w0, w2", // Shift the input byte right by the current bit position to bring the target bit to the LSB
        "and w3, w3, #0x01", // Isolate the LSB (which is now the target bit)
        "mov w4, w3", // Move the processed bit to w4 (to ensure w4 is correctly updated before duplication)
        "ext v2.16b, v0.16b, v0.16b, #1", // Shift v0 left by one byte to make space for the new byte
        "ins v2.b[0], w4",                // Insert the new byte at position 0 of v2
        "mov v0.16b, v2.16b",             // Move the temporary result back to v0
        "add w2, w2, #1",                 // Increment the bit position counter
        "cmp w2, #8",                     // Compare the counter with 8 (number of bits in a byte)
        "b.lt 1b",                        // If the counter is less than 8, continue the loop
        "ext v2.16b, v0.16b, v0.16b, #1", // Shift the last byte inserted into its final position in v2
        "ret",
        options(nostack, nomem)
    );
}

/// Prepares the XOR truth table using inline assembly.
///
/// This function uses inline ARMv8 assembly to generate a 16-bit XOR truth table
/// in vector register v0. Two patterns are combined to form the truth table:
/// - `0x0001`, placed in the upper half of the 32-bit word
/// - `0x0100`, overlaid into the lower half of the 32-bit word
///
/// The result is the 32-bit value `0x0001_0100`, which is then duplicated across
/// all lanes of the vector register v0. This produces a 16-byte sequence encoding
/// the XOR truth table pattern.
///
/// Parameters: None. This function does not take any parameters and is called
/// directly without arguments.
///
/// Returns: None. This function does not return a value and is used internally
/// for preparing the XOR truth table.
///
/// Safety: The function is marked as `unsafe` and uses inline assembly, which
/// may carry risks of undefined behavior if not used carefully. It is intended
/// for low-level operations and should be handled with care.
///
/// Notes: The assembly code first loads `0x0001` into the upper half of the
/// register, then overlays `0x0100` into the lower half, producing the combined
/// value `0x0001_0100`. This is duplicated into all lanes of v0, yielding:
/// `v0 = {0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00}`
///
/// Examples: None. This function is called internally and should not be used
/// directly in user code.
///
/// Options: The `nostack` and `nomem` options are used to prevent the function
/// from using the stack or memory, making it suitable for use in performance-critical sections.
pub unsafe extern "C" fn prepare_xor_truth_table() {
    asm!(
        // Load a pattern into a general-purpose register
        "movz w4, #0x0001, lsl #16", // Load 0x0001 into the upper half of w8 (bits 16-31)
        "movk w4, #0x0100",          // Overlay 0x0100 into the lower half of w8 (bits 0-15)
        "dup v0.4s, w4",             // Duplicate the 32-bit value in w8 across all lanes of v0
        // After the above operation, v0 contains:
        // v0 = {0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00}
        "ret",
        options(nostack, nomem)
    );
}

/// Prepares byte constants for a multiplication table using Arm AdvSIMD (NEON).
///
/// This routine does **not** perform any multiplication by itself. Instead, it
/// constructs a 128-bit vector in `v1` whose **lower 64-bit lane** is filled
/// with `0x02` bytes and whose **upper 64-bit lane** is filled with `0x01`
/// bytes. These constants can then be used elsewhere to build a multiplication
/// table or as operands for vectorized multiply instructions.
///
/// How it works:
/// - `movi v1.8b, #0x02` writes `0x02` into each of the 8 bytes of the **lower
///   64-bit lane** of `v1` (the upper 64 bits are unchanged here).
/// - `movi v8.8b, #0x01` writes `0x01` into each of the 8 bytes of the **lower
///   64-bit lane** of `v8`.
/// - `mov v1.d[1], v8.d[0]` copies `v8`’s lower 64-bit lane into `v1`’s upper
///   64-bit lane, overlaying that lane with `0x01` bytes.
///
/// Final state:
/// `v1 = { 0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02, 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01 }`
///
/// # Safety
/// This function is `unsafe` because it uses inline assembly. Although it is
/// marked with `options(nostack, nomem)` to declare that it neither touches the
/// stack nor arbitrary memory, the caller must still ensure the target CPU
/// supports the used AdvSIMD instructions and that the call site is sound.
///
/// # Parameters
/// None.
///
/// # Returns
/// `()`
///
/// # Errors
/// None.
///
/// # Notes
/// The inline assembly includes a `ret` instruction to return directly from the
/// function body.
///
/// # Examples
/// This function is intended to be called internally by code that consumes the
/// prepared constants; it is not typically invoked directly.
pub unsafe extern "C" fn prepare_multiplication_table() {
    asm!(
        // Load the patterns into NEON registers
        "movi v1.8b, #0x02",    // Set the lower half of v1 to 0x02
        "movi v8.8b, #0x01",    // Set the lower half of v8 to 0x01
        "mov v1.d[1], v8.d[0]", // Move the lower half of v8 to the upper half of v1
        // After the above operations, v1 contains:
        // v1 = {0x02 0x02 0x02 0x02 0x02 0x02 0x02 0x02 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01}
        "ret",
        options(nostack, nomem)
    );
}

/// Computes the XOR of two 16-bit values using SIMD index formation, a table
/// lookup, and a mask-and-reduce step (Armv8-A AdvSIMD/NEON).
///
/// This routine builds per-bit indices for the pair `(A, B)`, looks up the XOR
/// result for each bit through a 4-entry truth table, then converts those
/// per-bit results into a single scalar by masking and horizontally adding.
///
/// # How it works
///
/// ## 1) Index formation (from spread bits)
/// The inputs are assumed to be **already expanded** into byte lanes:
/// - `v2.16b` carries the per-bit **X indexes** for `A`
/// - `v1.16b` carries the per-bit **Y indexes** for `B` (as a small table of
///   multipliers)
///
/// ```text
/// mul v3.16b, v2.16b, v1.16b   ; element-wise partial indices
/// mov v3.d[1], v2.d[1]         ; move upper (Y) half to v3's low half
/// ext v1.16b, v3.16b, v3.16b, #8
/// add v1.16b, v3.16b, v1.16b   ; v1 := final indices 0..3 per bit
/// mov v1.d[1], xzr             ; zero the upper half (only low 8 bytes used)
/// ```
///
/// After this, each **low-half** byte of `v1` is an index in `{0,1,2,3}` that
/// encodes the bit pair `(A_i, B_i)` for bit position `i`.
///
/// ### Index map for a single bit
/// ```text
/// Index | (A,B) | XOR(A,B)
/// ------+-------+---------
///   0   |  0 0  |    0
///   1   |  0 1  |    1
///   2   |  1 0  |    1
///   3   |  1 1  |    0
/// ```
///
/// ## 2) Truth-table lookup (branchless XOR per bit)
/// `v0` must **already contain** the XOR truth table (prepared earlier by a
/// helper like `prepare_xor_truth_table`). The table is organized so that
/// `tbl` returns **0x00** for XOR=0 and **0x01** for XOR=1.
///
/// ```text
/// tbl v1.8b, {v0}, v1.8b   ; for each lane i, v1[i] := T[v1[i]]
/// ```
///
/// After `tbl`, the **low 8 bytes** of `v1` are either `0x00` (if A_i XOR B_i = 0)
/// or `0x01` (if A_i XOR B_i = 1).
///
/// ## 3) Masking and reduction to a scalar
/// Immediately after the lookup, we **repurpose** `v0` as a weight/mask vector
/// for bit-packing the 8 boolean results into a single byte. The constant
/// written into `v0.d[0]` is:
///
/// ```text
/// 0x0201_0804_2010_8040   (little-endian byte order in v0.d[0])
/// bytes: [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80]
/// ```
///
/// That is, `v0.8b = [1,2,4,8,16,32,64,128]`, i.e., one power-of-two weight per
/// bit position (LSB→MSB).
///
/// ### Masking diagram (per byte lane in the low half)
/// Let `r_i ∈ {0,1}` be the table result for bit position `i`.
///
/// ```text
/// Before mul:
///   v1.8b = [r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7]
///   v0.8b = [  1,   2,   4,   8,  16,  32,  64, 128]
///
/// Element-wise multiply:
///   v1.8b := v1.8b * v0.8b
///         = [r_0*1, r_1*2, r_2*4, r_3*8, r_4*16, r_5*32, r_6*64, r_7*128]
///
/// Horizontal add (low half):
///   addv b0, v1.8b  =>  b0 = Σ_i (r_i * 2^i)
/// ```
///
/// Thus `b0` becomes the 8-bit packed XOR of the low 8 bit positions. The code
/// then copies that scalar byte into `w0` as the return value.
///
/// ### End-to-end bit-flow (one low-half example)
/// ```text
/// (A,B) bits → indices (0..3) → tbl → r_i (0/1) → weights → sum
///
///   A: 1 0 1 1 0 0 1 0
///   B: 0 1 0 1 1 0 1 0
/// XOR:1 1 1 0 1 0 0 0  → r_i
///
/// r_i: [1,1,1,0,1,0,0,0]
/// mul with weights [1,2,4,8,16,32,64,128]:
///       [1,2,4,0,16,0,0,0]
/// addv → 1+2+4+16 = 23 → 0b00010111
/// ```
///
/// # Safety
/// - `unsafe` due to inline assembly and assumptions about register contents.
/// - Valid only on AArch64 targets with AdvSIMD (NEON).
/// - Caller must ensure:
///   - `v2`/`v1` are initialized as described (spread/index sources).
///   - `v0` holds the XOR truth table **before** the `tbl` step.
///   - Clobbered registers are acceptable at the call site.
/// - `options(nostack, nomem)` declares no stack/memory effects; register state
///   remains the caller’s responsibility.
///
/// # Notes
/// - Assembler syntaxes vary slightly; e.g., some toolchains require
///   `tbl v1.8b, {v0.16b}, v1.8b` and `ext v1.16b, v3.16b, v3.16b, #8`.
/// - The function clears the upper 64 bits of `v1` and reduces over the **low
///   8 bytes** only (`addv b0, v1.8b`).
///
/// # Returns
/// The packed XOR of the low 8 bit positions, in `w0`’s low byte. (If your
/// calling convention expects the 16-bit XOR value, extend/accumulate across
/// two halves accordingly.)
///
/// # Example
/// ```no_run
/// use dinoxor::dinoxor::calculate_xor_result;
///
/// unsafe {
///     // ...prepare v2 (X indexes) and v1 (Y multipliers), and preload v0 with the XOR table...
///     calculate_xor_result();
/// }
/// ```
pub unsafe extern "C" fn calculate_xor_result() {
    asm!(
        "mul v3.16b, v2.16b, v1.16b", // Multiply each byte in v2 (spread bits) by its corresponding byte in v1 (multiplication table)
        // The upper half of v3 now contains the relevant Xindexes
        "mov v3.d[1], v2.d[1]", // Move the upper half of v2 (Yindexes) to the lower half of v3
        "ext.16b v1, v3, v3, #8", // Extract the upper half of v3 and store it in v1
        "add.16b v1, v3, v1",   // Add v3 and v1 to get the final indices for the truth table lookup
        "mov.d v1[1], xzr",     // Clear the upper half of v1 (set it to 0)
        "tbl.8b v1, {{v0}}, v1", // Perform a table lookup using the indices in v1 and the truth table in v0
        // Store the result in v1
        // Set up v0 with the desired values for the multiplication
        "movz x1, #0x0201, lsl #0", // Load the lower 16 bits of x1 with 0x0201
        "movk x1, #0x0804, lsl #16", // Load the next 16 bits of x1 with 0x0804
        "movk x1, #0x2010, lsl #32", // Load the next 16 bits of x1 with 0x2010
        "movk x1, #0x8040, lsl #48", // Load the upper 16 bits of x1 with 0x8040
        "mov v0.d[0], x1",          // Move the 64-bit value from x1 to the lower half of v0
        "mul v1.16b, v1.16b, v0.16b", // Multiply v1 (table lookup result) by v0 (predefined pattern) element-wise
        "addv b0, v1.8b", // Sum the values in the lower half of v1 and store the result in b0 (alias for lower 32 bit scalar in v0)
        "umov w0, v0.b[0]", // Move the 8-bit scalar value from b0 to w0 (return value)
        "ret",
        options(nostack, nomem)
    );
}