hashx 0.7.1

Hash function family designed for ASIC-resistant client puzzles
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
//! Scheduling model for program generation
//!
//! HashX uses a simple scheduling model inspired by the Intel Ivy Bridge
//! microarchitecture to choose registers that should be available and
//! avoid stalls.

use crate::program::{Instruction, Opcode};
use crate::register::{NUM_REGISTERS, RegisterId};

/// Scheduling information for each opcode
mod model {
    use crate::program::Opcode;

    /// Number of simulated cycles we run before stopping program generation
    pub(super) const TARGET_CYCLES: usize = 192;

    /// Total number of cycles to store schedule data for
    pub(super) const SCHEDULE_SIZE: usize = TARGET_CYCLES + MAX_LATENCY;

    /// Number of execution ports in our simulated microarchitecture
    pub(super) const NUM_EXECUTION_PORTS: usize = 3;

    /// Identify one or more CPU execution ports
    ///
    /// They are inspired by the Intel Ivy Bridge design and the original HashX
    /// implementation names them P5, P0, and P1.
    ///
    /// We can mostly ignore the names and treat this as an array of three
    /// ports specified in the order that HashX tries to allocate them. The
    /// original HashX implementation explains this allocation order as an
    /// optimistic strategy which assumes P1 will typically be reserved for
    /// multiplication.
    #[derive(Debug, Clone, Copy, Eq, PartialEq)]
    pub(super) struct ExecPorts(pub(super) u8);

    /// Port P5 (first choice) only
    const P5: ExecPorts = ExecPorts(1 << 0);
    /// Port P0 (second choice) only
    const P0: ExecPorts = ExecPorts(1 << 1);
    /// Port P1 (third choice) only
    const P1: ExecPorts = ExecPorts(1 << 2);
    /// Either port P0 or P1
    const P01: ExecPorts = ExecPorts(P0.0 | P1.0);
    /// Either port P0 or P5
    const P05: ExecPorts = ExecPorts(P0.0 | P5.0);
    /// Any of the three ports
    const P015: ExecPorts = ExecPorts(P0.0 | P1.0 | P5.0);

    /// Maximum value of [`instruction_latency_cycles()`] for any Opcode
    const MAX_LATENCY: usize = 4;

    /// Latency for each operation, in cycles
    #[inline(always)]
    pub(super) fn instruction_latency_cycles(op: Opcode) -> usize {
        match op {
            Opcode::AddConst => 1,
            Opcode::AddShift => 1,
            Opcode::Branch => 1,
            Opcode::Rotate => 1,
            Opcode::Sub => 1,
            Opcode::Target => 1,
            Opcode::Xor => 1,
            Opcode::XorConst => 1,
            Opcode::Mul => 3,
            Opcode::SMulH => 4,
            Opcode::UMulH => 4,
        }
    }

    /// Break an instruction down into one or two micro-operation port sets.
    #[inline(always)]
    pub(super) fn micro_operations(op: Opcode) -> (ExecPorts, Option<ExecPorts>) {
        match op {
            Opcode::AddConst => (P015, None),
            Opcode::Sub => (P015, None),
            Opcode::Xor => (P015, None),
            Opcode::XorConst => (P015, None),
            Opcode::Mul => (P1, None),
            Opcode::AddShift => (P01, None),
            Opcode::Rotate => (P05, None),
            Opcode::SMulH => (P1, Some(P5)),
            Opcode::UMulH => (P1, Some(P5)),
            Opcode::Branch => (P015, Some(P015)),
            Opcode::Target => (P015, Some(P015)),
        }
    }

    /// Each instruction advances the earliest possible issuing cycle by one
    /// sub-cycle per micro-op.
    #[inline(always)]
    pub(super) fn instruction_sub_cycle_count(op: Opcode) -> usize {
        match micro_operations(op) {
            (_, None) => 1,
            (_, Some(_)) => 2,
        }
    }
}

/// One single execution port
///
/// Formatted to use easily as an array index.
/// It's helpful for this to be a compact data type.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct ExecPortIndex {
    /// Equivalent to a bit position in [`model::ExecPorts`].
    index: u8,
}

/// Overall state for the simulated execution schedule
#[derive(Debug, Default, Clone)]
pub(crate) struct Scheduler {
    /// Current timestamp in the schedule, in sub-cycles
    ///
    /// Sub-cycles advance as code is generated, modeling the time taken to
    /// fetch and decode instructions.
    sub_cycle: SubCycle,

    /// Current timestamp in the schedule, in cycles
    ///
    /// Derived from sub_cycle.
    cycle: Cycle,

    /// State for scheduling execution by fitting micro-ops into execution ports
    exec: ExecSchedule,

    /// State for scheduling register access by keeping track of data latency
    data: DataSchedule,
}

impl Scheduler {
    /// Create a new empty execution schedule at cycle 0
    #[inline(always)]
    pub(crate) fn new() -> Self {
        Default::default()
    }

    /// Stall for one cycle.
    ///
    /// Used when register allocation fails.
    /// Returns `Ok` if we had enough time, or `Err` if we ran out.
    #[inline(always)]
    pub(crate) fn stall(&mut self) -> Result<(), ()> {
        self.advance(SubCycle::PER_CYCLE as usize)
    }

    /// Return the current instruction fetch/decode timestamp in sub-cycles.
    #[inline(always)]
    pub(crate) fn instruction_stream_sub_cycle(&self) -> SubCycle {
        self.sub_cycle
    }

    /// Advance forward in time by some number of sub-cycles.
    ///
    /// Stops just before reaching `Cycle::target()`, where we stop
    /// scheduling new instructions.
    #[inline(always)]
    fn advance(&mut self, n: usize) -> Result<(), ()> {
        let sub_cycle = self.sub_cycle.add_usize(n)?;
        let cycle = sub_cycle.cycle();
        if cycle < Cycle::target() {
            self.sub_cycle = sub_cycle;
            self.cycle = cycle;
            Ok(())
        } else {
            Err(())
        }
    }

    /// Advance time forward by the modeled duration of the instruction fetch
    /// and decode, in sub-cycles.
    ///
    /// Returns Ok if we still have enough time to schedule more, or Err if the
    /// schedule would be full.
    #[inline(always)]
    pub(crate) fn advance_instruction_stream(&mut self, op: Opcode) -> Result<(), ()> {
        self.advance(model::instruction_sub_cycle_count(op))
    }

    /// Calculate a timing plan describing the cycle and execution units
    /// on which a particular opcode could run, at the earliest.
    #[inline(always)]
    pub(crate) fn instruction_plan(&self, op: Opcode) -> Result<InstructionPlan, ()> {
        self.exec.instruction_plan(self.cycle, op)
    }

    /// Commit to using a plan returned by [`Self::instruction_plan()`],
    /// for a particular concrete [`Instruction`] instance.
    ///
    /// Marks as busy each execution unit cycle in the plan, and updates the
    /// latency for the [`Instruction`]'s destination register if it has one.
    #[inline(always)]
    pub(crate) fn commit_instruction_plan(&mut self, plan: &InstructionPlan, inst: &Instruction) {
        self.exec.mark_instruction_busy(plan);
        if let Some(dst) = inst.destination() {
            self.data
                .plan_register_write(dst, plan.cycle_retired(inst.opcode()));
        }
    }

    /// Look up if a register will be available at or before the indicated cycle.
    #[inline(always)]
    pub(crate) fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
        self.data.register_available(reg, cycle)
    }

    /// Return the overall data latency.
    ///
    /// This is the Cycle at which we expect every register
    /// to reach its final simulated state.
    #[inline(always)]
    pub(crate) fn overall_latency(&self) -> Cycle {
        self.data.overall_latency()
    }
}

/// Cycle timestamp
///
/// Measured from the beginning of the program, assuming no branches taken.
/// It's useful to be able to store these compactly in register latency arrays.
#[derive(Debug, Default, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub(crate) struct Cycle(u8);

impl Cycle {
    /// HashX stops generating code once the issue cycle reaches this target.
    #[inline(always)]
    fn target() -> Self {
        Cycle(
            model::TARGET_CYCLES
                .try_into()
                .expect("Cycle type wide enough for target count"),
        )
    }

    /// Cast this Cycle count to a usize losslessly.
    #[inline(always)]
    pub(crate) fn as_usize(&self) -> usize {
        self.0.into()
    }

    /// Add an integer number of cycles, returning Err(()) if we reach the end.
    #[inline(always)]
    fn add_usize(&self, n: usize) -> Result<Self, ()> {
        let result = self.as_usize() + n;
        if result < model::SCHEDULE_SIZE {
            Ok(Cycle(
                result
                    .try_into()
                    .expect("Cycle type wide enough for full schedule size"),
            ))
        } else {
            Err(())
        }
    }
}

/// Sub-cycle timestamp
///
/// Timer for instruction decode, at a finer resolution than the Cycle
/// we use for keeping schedule records. Doesn't need to be compact.
#[derive(Debug, Default, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub(crate) struct SubCycle(u16);

impl SubCycle {
    /// Number of sub-cycles per cycle
    const PER_CYCLE: u16 = 3;

    /// Maximum value of a sub-cycle counter, based on the schedule size
    const MAX: Self = SubCycle(model::SCHEDULE_SIZE as u16 * Self::PER_CYCLE - 1);

    /// Cast this sub-cycle count into a usize losslessly.
    #[inline(always)]
    pub(crate) fn as_usize(&self) -> usize {
        self.0.into()
    }

    /// Convert this sub-cycle into a full Cycle timestamp.
    #[inline(always)]
    fn cycle(&self) -> Cycle {
        Cycle(
            (self.0 / Self::PER_CYCLE)
                .try_into()
                .expect("Cycle type wide enough for conversion from SubCycle"),
        )
    }

    /// Advance by an integer number of sub-cycles.
    ///
    /// Returns the new advanced [`SubCycle`], or `Err(())`
    /// if we reach the end of the schedule.
    #[inline(always)]
    fn add_usize(&self, n: usize) -> Result<Self, ()> {
        let result = self.as_usize() + n;
        if result < Self::MAX.0.into() {
            Ok(SubCycle(result.try_into().expect(
                "SubCycle type wide enough for full schedule size",
            )))
        } else {
            Err(())
        }
    }
}

/// Busy tracking for one CPU execution port
#[derive(Debug, Default, Clone)]
struct PortSchedule {
    /// Bit array indicating when this port is busy, indexed by Cycle
    busy: SimpleBitArray<
        { (usize::BITS as usize + model::SCHEDULE_SIZE - 1) / (usize::BITS as usize) },
    >,
}

/// Latency tracking for all relevant CPU registers
#[derive(Debug, Default, Clone)]
struct DataSchedule {
    /// Cycle count at which this register is available, indexed by register ID
    latencies: [Cycle; NUM_REGISTERS],
}

impl DataSchedule {
    /// Plan to finish a register write at the indicated cycle
    #[inline(always)]
    fn plan_register_write(&mut self, dst: RegisterId, cycle: Cycle) {
        self.latencies[dst.as_usize()] = cycle;
    }

    /// Look up if a register will be available at or before the indicated cycle.
    #[inline(always)]
    fn register_available(&self, reg: RegisterId, cycle: Cycle) -> bool {
        self.latencies[reg.as_usize()] <= cycle
    }

    /// Return the overall latency, the [`Cycle`] at which we expect
    /// every register to reach its latest simulated state.
    #[inline(always)]
    fn overall_latency(&self) -> Cycle {
        match self.latencies.iter().max() {
            Some(cycle) => *cycle,
            None => Default::default(),
        }
    }
}

/// Execution schedule for all ports
///
/// This is a scoreboard that keeps track of which CPU units are busy in which
/// cycles, so we can estimate a timestamp at which each instruction will write
/// its result.
#[derive(Debug, Default, Clone)]
struct ExecSchedule {
    /// Execution schedule (busy flags) for each port
    ports: [PortSchedule; model::NUM_EXECUTION_PORTS],
}

impl ExecSchedule {
    /// Calculate the next cycle at which we could schedule one micro-op.
    ///
    /// HashX always searches execution ports in the same order, and it will
    /// look ahead up to the entire length of the schedule before failing.
    #[inline(always)]
    fn micro_plan(&self, begin: Cycle, ports: model::ExecPorts) -> Result<MicroOpPlan, ()> {
        let mut cycle = begin;
        loop {
            for index in 0..(model::NUM_EXECUTION_PORTS as u8) {
                if (ports.0 & (1 << index)) != 0
                    && !self.ports[index as usize].busy.get(cycle.as_usize())
                {
                    return Ok(MicroOpPlan {
                        cycle,
                        port: ExecPortIndex { index },
                    });
                }
            }
            cycle = cycle.add_usize(1)?;
        }
    }

    /// Mark the schedule busy according to a previously calculated plan.
    #[inline(always)]
    fn mark_micro_busy(&mut self, plan: MicroOpPlan) {
        self.ports[plan.port.index as usize]
            .busy
            .set(plan.cycle.as_usize(), true);
    }

    /// Calculate a timing plan describing the cycle and execution units
    /// we could use for scheduling one entire instruction.
    ///
    /// This takes place after the [`Opcode`] has been chosen but before
    /// a full [`Instruction`] can be assembled.
    #[inline(always)]
    fn instruction_plan(&self, begin: Cycle, op: Opcode) -> Result<InstructionPlan, ()> {
        match model::micro_operations(op) {
            // Single-op instructions
            (single_port, None) => {
                InstructionPlan::from_micro_plans(self.micro_plan(begin, single_port)?, None)
            }

            // HashX schedules two-op instructions by searching forward
            // until we find the first cycle where both ports are
            // simultaneously available.
            (first_port, Some(second_port)) => {
                let mut cycle = begin;
                loop {
                    if let (Ok(first_plan), Ok(second_plan)) = (
                        self.micro_plan(cycle, first_port),
                        self.micro_plan(cycle, second_port),
                    ) {
                        if let Ok(joint_plan) =
                            InstructionPlan::from_micro_plans(first_plan, Some(second_plan))
                        {
                            return Ok(joint_plan);
                        }
                    }
                    cycle = cycle.add_usize(1)?;
                }
            }
        }
    }

    /// Mark each micro-op in an InstructionPlan as busy in the schedule.
    #[inline(always)]
    fn mark_instruction_busy(&mut self, plan: &InstructionPlan) {
        let (first, second) = plan.as_micro_plans();
        self.mark_micro_busy(first);
        if let Some(second) = second {
            self.mark_micro_busy(second);
        }
    }
}

/// Detailed execution schedule for one micro-operation
///
/// Includes the [`Cycle`] it begins on, and the actual
/// execution port it was assigned.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct MicroOpPlan {
    /// The Cycle this operation begins on
    cycle: Cycle,
    /// Index of the execution port it runs on
    port: ExecPortIndex,
}

/// Detailed execution schedule for one instruction
///
/// This is defined as either one or two micro-operations
/// scheduled on the same cycle.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub(crate) struct InstructionPlan {
    /// The Cycle this whole instruction begins on
    cycle: Cycle,
    /// First execution port, always present
    first_port: ExecPortIndex,
    /// Optional second execution port ID, for two-uop instructions
    second_port: Option<ExecPortIndex>,
}

impl InstructionPlan {
    /// Get the Cycle this whole instruction begins on.
    #[inline(always)]
    pub(crate) fn cycle_issued(&self) -> Cycle {
        self.cycle
    }

    /// Calculate the cycle this instruction is complete by.
    #[inline(always)]
    pub(crate) fn cycle_retired(&self, op: Opcode) -> Cycle {
        self.cycle
            .add_usize(model::instruction_latency_cycles(op))
            .expect("instruction retired prior to end of schedule")
    }

    /// Convert this `InstructionPlan` back to one or two [`MicroOpPlan`]s.
    #[inline(always)]
    fn as_micro_plans(&self) -> (MicroOpPlan, Option<MicroOpPlan>) {
        (
            MicroOpPlan {
                cycle: self.cycle,
                port: self.first_port,
            },
            self.second_port.map(|port| MicroOpPlan {
                cycle: self.cycle,
                port,
            }),
        )
    }

    /// Bundle two [`MicroOpPlan`]s into an [`InstructionPlan`],
    /// if they are on matching cycles.
    ///
    /// Returns `Err(())` if the combination is not possible.
    #[inline(always)]
    fn from_micro_plans(first_op: MicroOpPlan, second_op: Option<MicroOpPlan>) -> Result<Self, ()> {
        let second_port = match second_op {
            None => None,
            Some(second_op) => {
                if first_op.cycle == second_op.cycle {
                    Some(second_op.port)
                } else {
                    return Err(());
                }
            }
        };
        Ok(Self {
            cycle: first_op.cycle,
            first_port: first_op.port,
            second_port,
        })
    }
}

/// Simple packed bit array implementation
///
/// This could use the `bitvec` crate if we cared to depend on it, but the
/// functionality we need is so tiny let's keep this simple.
#[derive(Debug, Clone, Eq, PartialEq)]
struct SimpleBitArray<const N: usize> {
    /// Array of words to use as a bit vector, in LSB-first order
    inner: [usize; N],
}

impl<const N: usize> Default for SimpleBitArray<N> {
    #[inline(always)]
    fn default() -> Self {
        Self {
            inner: [0_usize; N],
        }
    }
}

impl<const N: usize> SimpleBitArray<N> {
    /// Set or clear one bit in the array.
    ///
    /// Panics if the index is out of range.
    #[inline(always)]
    fn set(&mut self, index: usize, value: bool) {
        let word_size = usize::BITS as usize;
        let word_index = index / word_size;
        let bit_mask = 1 << (index % word_size);
        if value {
            self.inner[word_index] |= bit_mask;
        } else {
            self.inner[word_index] &= !bit_mask;
        }
    }

    /// Get one bit from the array.
    ///
    /// Panics if the index is out of range.
    #[inline(always)]
    fn get(&self, index: usize) -> bool {
        let word_size = usize::BITS as usize;
        let word_index = index / word_size;
        let bit_mask = 1 << (index % word_size);
        0 != (self.inner[word_index] & bit_mask)
    }
}