1extern crate blake2b_simd;
2
3use self::blake2b_simd::Params;
4use std::convert::TryInto;
5use std::fmt;
6use strum::Display;
7
8use super::common::{mulh, randomx_reciprocal, smulh, u64_from_u32_imm};
9use super::program::REG_NEEDS_DISPLACEMENT_IX;
10
// Target latency budget (in simulated cycles) for one superscalar program.
const RANDOMX_SUPERSCALAR_LATENCY: usize = 170;
// The scheduling map is slightly longer than the budget to absorb overflow.
const CYCLE_MAP_SIZE: usize = RANDOMX_SUPERSCALAR_LATENCY + 4;
// Hard cap on the number of instructions in a generated program.
const SUPERSCALAR_MAX_SIZE: usize = 3 * RANDOMX_SUPERSCALAR_LATENCY + 2;
// How many cycles ahead operand selection may search for a ready register.
const LOOK_FORWARD_CYCLES: usize = 4;
// Maximum consecutive discarded instructions before generation gives up.
const MAX_THROWAWAY_COUNT: usize = 256;
16
/// Superscalar instruction opcodes. The discriminant values are fixed and
/// must not be reordered; `INVALID` marks an uninitialized instruction and
/// `COUNT` is a sentinel, neither is ever executed (see `execute`).
#[allow(nonstandard_style)]
#[derive(Copy, Clone, Display, Debug, PartialEq)]
pub enum ScOpcode {
    INVALID = -1,
    ISUB_R = 0,  // dst -= src
    IXOR_R = 1,  // dst ^= src
    IADD_RS = 2, // dst += src << shift (shift from the mod byte)
    IMUL_R = 3,  // dst *= src (low 64 bits)
    IROR_C = 4,  // dst rotated right by a constant
    // The C7/C8/C9 variants perform the same operation; they differ only in
    // which decoder slot width they occupy.
    IADD_C7 = 5,
    IXOR_C7 = 6,
    IADD_C8 = 7,
    IXOR_C8 = 8,
    IADD_C9 = 9,
    IXOR_C9 = 10,
    IMULH_R = 11,  // dst = high 64 bits of unsigned dst * src
    ISMULH_R = 12, // dst = high 64 bits of signed dst * src
    IMUL_RCP = 13, // dst *= reciprocal(imm32)
    COUNT = 14,
}
37
38impl ScOpcode {
39 fn is_multiplication(self) -> bool {
40 self == ScOpcode::IMUL_R
41 || self == ScOpcode::IMULH_R
42 || self == ScOpcode::ISMULH_R
43 || self == ScOpcode::IMUL_RCP
44 }
45}
46
/// Per-register scheduling state tracked while generating a program.
#[derive(Copy, Clone)]
struct RegisterInfo {
    // Opcode group of the last instruction that wrote this register.
    pub last_op_group: ScOpcode,
    // Cycle at which the register's value becomes available.
    pub latency: usize,
    // Group parameter of the last writing instruction; -1 means none.
    pub last_op_par: i32,
}
53
54impl RegisterInfo {
55 fn new() -> RegisterInfo {
56 RegisterInfo {
57 latency: 0,
58 last_op_group: ScOpcode::INVALID,
59 last_op_par: -1,
60 }
61 }
62}
63
/// One superscalar instruction, built up during generation and scheduling.
#[derive(Copy, Clone, Debug)]
pub struct ScInstr<'a> {
    // Static descriptor: opcode plus macro-op breakdown.
    pub info: &'a ScInstrInfo,
    // Destination / source register indices; -1 while unassigned.
    pub dst: i32,
    pub src: i32,
    // Raw mod byte; bits 2..3 encode the IADD_RS shift (see `mod_shift`).
    pub mod_v: u8,
    // 32-bit immediate (rotation count, constant, or reciprocal input).
    pub imm32: u32,
    // Opcode group used by the "no repeated op on the same register" rule.
    pub op_group: ScOpcode,
    // Parameter refining the op-group comparison; -1 means none.
    pub op_group_par: i32,
    // Whether the destination may equal the source register.
    pub can_reuse: bool,
    // Whether `op_group_par` should be set to the selected source register.
    pub group_par_is_source: bool,
}
76
impl ScInstr<'_> {
    /// Placeholder used before the first real instruction is decoded; its
    /// `info` is the empty NOP descriptor and both operands are unassigned.
    fn null() -> ScInstr<'static> {
        ScInstr {
            info: &NOP,
            dst: -1,
            src: -1,
            mod_v: 0,
            imm32: 0,
            op_group: ScOpcode::INVALID,
            can_reuse: false,
            group_par_is_source: false,
            op_group_par: -1,
        }
    }

    /// Shift amount for IADD_RS, taken from bits 2..3 of the mod byte (0..=3).
    pub fn mod_shift(&self) -> u64 {
        ((self.mod_v >> 2) % 4) as u64
    }

    /// Tries to pick a destination register that is ready at `cycle`.
    /// Returns false when no register satisfies all constraints.
    fn select_destination(
        &mut self,
        cycle: usize,
        allow_chain_mul: bool,
        registers: &[RegisterInfo; 8],
        generator: &mut Blake2Generator,
    ) -> bool {
        let mut available_registers = Vec::with_capacity(8);
        for (i, v) in registers.iter().enumerate() {
            // A register is eligible when:
            //  - its value is available by `cycle`,
            //  - it differs from the source unless the op allows dst == src,
            //  - an IMUL_R does not target a register last written by IMUL_R
            //    (unless chaining is explicitly allowed),
            //  - it does not repeat the previous op group with the same
            //    group parameter on that register,
            //  - IADD_RS avoids the register reserved for displacement.
            if v.latency <= cycle
                && (self.can_reuse || i as i32 != self.src)
                && (allow_chain_mul
                    || self.op_group != ScOpcode::IMUL_R
                    || v.last_op_group != ScOpcode::IMUL_R)
                && (v.last_op_group != self.op_group || v.last_op_par != self.op_group_par)
                && (self.info.op != ScOpcode::IADD_RS || i != REG_NEEDS_DISPLACEMENT_IX)
            {
                available_registers.push(i);
            }
        }
        self.select_register(&available_registers, generator, false)
    }

    /// Tries to pick a source register that is ready at `cycle`.
    /// Returns false when none is available.
    fn select_source(
        &mut self,
        cycle: usize,
        registers: &[RegisterInfo; 8],
        generator: &mut Blake2Generator,
    ) -> bool {
        let mut available_registers = Vec::with_capacity(8);

        for (i, v) in registers.iter().enumerate() {
            if v.latency <= cycle {
                available_registers.push(i);
            }
        }

        // Special case: when exactly two registers are free and one of them
        // is the displacement register, IADD_RS takes the displacement
        // register unconditionally (no randomness consumed).
        if available_registers.len() == 2
            && self.info.op == ScOpcode::IADD_RS
            && (available_registers[0] == REG_NEEDS_DISPLACEMENT_IX
                || available_registers[1] == REG_NEEDS_DISPLACEMENT_IX)
        {
            self.op_group_par = REG_NEEDS_DISPLACEMENT_IX as i32;
            self.src = REG_NEEDS_DISPLACEMENT_IX as i32;
            return true;
        }

        if self.select_register(&available_registers, generator, true) {
            // For ops whose group parameter tracks the source, record it.
            if self.group_par_is_source {
                self.op_group_par = self.src;
            }
            return true;
        }
        false
    }

    /// Randomly picks one register from `available_registers` and stores it
    /// into `src` (when `reg_src`) or `dst`. False when the list is empty.
    fn select_register(
        &mut self,
        available_registers: &[usize],
        generator: &mut Blake2Generator,
        reg_src: bool,
    ) -> bool {
        if available_registers.is_empty() {
            return false;
        }
        // Randomness is consumed only when there is an actual choice; this
        // ordering of generator calls is part of the deterministic output.
        let index = if available_registers.len() > 1 {
            generator.get_u32() as usize % available_registers.len()
        } else {
            0
        };

        if reg_src {
            self.src = available_registers[index] as i32;
        } else {
            self.dst = available_registers[index] as i32;
        }
        true
    }
}
175
// Candidate instructions per decode-slot width (in bytes). For 3-byte slots
// the wide multiplications (indices 2..3 of SLOT_3L) are only reachable in
// the last slot of a decode group (see `create_for_slot`).
static SLOT_3L: [&ScInstrInfo; 4] = [&ISUB_R, &IXOR_R, &IMULH_R, &ISMULH_R];
static SLOT_4: [&ScInstrInfo; 2] = [&IROR_C, &IADD_RS];
static SLOT_7: [&ScInstrInfo; 2] = [&IXOR_C7, &IADD_C7];
static SLOT_8: [&ScInstrInfo; 2] = [&IXOR_C8, &IADD_C8];
static SLOT_9: [&ScInstrInfo; 2] = [&IXOR_C9, &IADD_C9];
// A 10-byte slot always holds IMUL_RCP.
static SLOT_10: &ScInstrInfo = &IMUL_RCP;
182
/// True when `v` is zero or a power of two (at most one bit set).
/// Used to reject degenerate IMUL_RCP immediates.
fn is_zero_or_power_of_2(v: u32) -> bool {
    // Same result as the `v & (v - 1) == 0` bit trick, spelled with the
    // standard-library predicate for clarity.
    v == 0 || v.is_power_of_two()
}
186
impl ScInstr<'_> {
    /// Creates a random instruction fitting a decode slot of `slot_size`
    /// bytes. `fetch_type` is the decoder-buffer index; `is_last` marks the
    /// final slot of the fetch group, which may take wide multiplications.
    pub fn create_for_slot<'a>(
        generator: &mut Blake2Generator,
        slot_size: u32,
        fetch_type: u32,
        is_last: bool,
    ) -> ScInstr<'a> {
        match slot_size {
            3 => {
                if is_last {
                    // Last slot: any of the four 3-byte ops, including
                    // IMULH_R / ISMULH_R.
                    ScInstr::create(SLOT_3L[(generator.get_byte() & 3) as usize], generator)
                } else {
                    // Otherwise only ISUB_R / IXOR_R (first two entries).
                    ScInstr::create(SLOT_3L[(generator.get_byte() & 1) as usize], generator)
                }
            }
            4 => {
                if fetch_type == 4 && !is_last {
                    // The 4-4-4-4 buffer (index 4) forces IMUL_R here to
                    // raise the multiplication count.
                    ScInstr::create(&IMUL_R, generator)
                } else {
                    ScInstr::create(SLOT_4[(generator.get_byte() & 1) as usize], generator)
                }
            }
            7 => ScInstr::create(SLOT_7[(generator.get_byte() & 1) as usize], generator),
            8 => ScInstr::create(SLOT_8[(generator.get_byte() & 1) as usize], generator),
            9 => ScInstr::create(SLOT_9[(generator.get_byte() & 1) as usize], generator),
            10 => ScInstr::create(SLOT_10, generator),
            _ => panic!("illegal slot_size {}", slot_size),
        }
    }

    /// Instantiates `info` as a concrete instruction, drawing any immediate
    /// or mod byte it needs from `generator` (call order is significant).
    fn create<'a>(info: &'static ScInstrInfo, generator: &mut Blake2Generator) -> ScInstr<'a> {
        match info.op {
            // ISUB_R shares the IADD_RS op group for the repetition rule.
            ScOpcode::ISUB_R => ScInstr {
                info,
                dst: -1,
                src: -1,
                mod_v: 0,
                imm32: 0,
                op_group: ScOpcode::IADD_RS,
                can_reuse: false,
                group_par_is_source: true,
                op_group_par: 0,
            },
            ScOpcode::IXOR_R => ScInstr {
                info,
                dst: -1,
                src: -1,
                mod_v: 0,
                imm32: 0,
                op_group: ScOpcode::IXOR_R,
                can_reuse: false,
                group_par_is_source: true,
                op_group_par: 0,
            },
            // IADD_RS draws a mod byte for its shift amount.
            ScOpcode::IADD_RS => ScInstr {
                info,
                dst: -1,
                src: -1,
                mod_v: generator.get_byte(),
                imm32: 0,
                op_group: ScOpcode::IADD_RS,
                can_reuse: false,
                group_par_is_source: true,
                op_group_par: 0,
            },
            ScOpcode::IMUL_R => ScInstr {
                info,
                dst: -1,
                src: -1,
                mod_v: 0,
                imm32: 0,
                op_group: ScOpcode::IMUL_R,
                can_reuse: false,
                group_par_is_source: true,
                op_group_par: 0,
            },
            // IROR_C: rotation count is a nonzero 6-bit value; redraw on 0.
            ScOpcode::IROR_C => {
                let mut imm32;
                while {
                    imm32 = generator.get_byte() & 63;
                    imm32 == 0
                } {}
                ScInstr {
                    info,
                    dst: -1,
                    src: -1,
                    mod_v: 0,
                    imm32: imm32 as u32,
                    op_group: ScOpcode::IROR_C,
                    can_reuse: false,
                    group_par_is_source: true,
                    op_group_par: 0,
                }
            }
            // All immediate adds share the IADD_C7 op group.
            ScOpcode::IADD_C7 | ScOpcode::IADD_C8 | ScOpcode::IADD_C9 => ScInstr {
                info,
                dst: -1,
                src: -1,
                mod_v: 0,
                imm32: generator.get_u32(),
                op_group: ScOpcode::IADD_C7,
                can_reuse: false,
                group_par_is_source: false,
                op_group_par: -1,
            },
            // All immediate xors share the IXOR_C7 op group.
            ScOpcode::IXOR_C7 | ScOpcode::IXOR_C8 | ScOpcode::IXOR_C9 => ScInstr {
                info,
                dst: -1,
                src: -1,
                mod_v: 0,
                imm32: generator.get_u32(),
                op_group: ScOpcode::IXOR_C7,
                can_reuse: false,
                group_par_is_source: false,
                op_group_par: -1,
            },
            // Wide multiplies carry a random group parameter.
            ScOpcode::IMULH_R => ScInstr {
                info,
                dst: -1,
                src: -1,
                mod_v: 0,
                imm32: 0,
                op_group: ScOpcode::IMULH_R,
                group_par_is_source: true,
                can_reuse: false,
                op_group_par: generator.get_u32() as i32,
            },
            ScOpcode::ISMULH_R => ScInstr {
                info,
                dst: -1,
                src: -1,
                mod_v: 0,
                imm32: 0,
                op_group: ScOpcode::ISMULH_R,
                group_par_is_source: true,
                can_reuse: false,
                op_group_par: generator.get_u32() as i32,
            },
            // IMUL_RCP: redraw until the immediate is neither zero nor a
            // power of two.
            ScOpcode::IMUL_RCP => {
                let mut imm32;
                while {
                    imm32 = generator.get_u32();
                    is_zero_or_power_of_2(imm32)
                } {}
                ScInstr {
                    info,
                    dst: -1,
                    src: -1,
                    mod_v: 0,
                    imm32,
                    op_group: ScOpcode::IMUL_RCP,
                    can_reuse: false,
                    group_par_is_source: true,
                    op_group_par: -1,
                }
            }
            ScOpcode::INVALID | ScOpcode::COUNT => panic!("invalid opcode {} here", info.op),
        }
    }
}
347
/// Bit-flag encoding of the simulated execution ports. Composite variants
/// mean "may issue on any of these ports"; NULL means no port at all.
#[derive(Copy, Clone, PartialEq, Debug)]
#[repr(u8)]
pub enum ExecutionPort {
    NULL = 0,
    P0 = 1,
    P1 = 2,
    P5 = 4,
    P01 = ExecutionPort::P0 as u8 | ExecutionPort::P1 as u8,
    P05 = ExecutionPort::P0 as u8 | ExecutionPort::P5 as u8,
    P015 = ExecutionPort::P0 as u8 | ExecutionPort::P1 as u8 | ExecutionPort::P5 as u8,
}
359
360impl ExecutionPort {
361 fn is(self, check: ExecutionPort) -> bool {
362 (self as u8 & check as u8) != 0
363 }
364}
365
/// One simulated x86 macro-operation: its encoded size, latency, and the
/// execution port(s) of its one or two uops.
#[derive(Debug)]
pub struct ScMacroOp {
    #[allow(dead_code)]
    name: &'static str,
    // Encoded size in bytes (contributes to `code_size`).
    size: usize,
    // Result latency in cycles.
    latency: usize,
    // First uop's allowed ports; NULL means the op needs no port at all.
    uop1: ExecutionPort,
    // Second uop's allowed ports; NULL means the op is a single uop.
    uop2: ExecutionPort,
    // Whether this op must wait for the previous macro-op's result.
    dependent: bool,
}
376
377impl ScMacroOp {
378 pub const fn new(
379 name: &'static str,
380 size: usize,
381 latency: usize,
382 uop1: ExecutionPort,
383 uop2: ExecutionPort,
384 ) -> ScMacroOp {
385 ScMacroOp {
386 name,
387 size,
388 latency,
389 uop1,
390 uop2,
391 dependent: false,
392 }
393 }
394 pub const fn new_dep(
395 name: &'static str,
396 size: usize,
397 latency: usize,
398 uop1: ExecutionPort,
399 uop2: ExecutionPort,
400 ) -> ScMacroOp {
401 ScMacroOp {
402 name,
403 size,
404 latency,
405 uop1,
406 uop2,
407 dependent: true,
408 }
409 }
410
411 pub fn is_eliminated(&self) -> bool {
412 self.uop1 == ExecutionPort::NULL
413 }
414
415 pub fn is_simple(&self) -> bool {
416 self.uop2 == ExecutionPort::NULL
417 }
418}
419
// Macro-op table: name, encoded size in bytes, latency in cycles, and the
// port(s) of each uop (second port NULL = single uop).
static MOP_SUB_RR: ScMacroOp =
    ScMacroOp::new("SUB_RR", 3, 1, ExecutionPort::P015, ExecutionPort::NULL);
static MOP_XOR_RR: ScMacroOp =
    ScMacroOp::new("XOR_RR", 3, 1, ExecutionPort::P015, ExecutionPort::NULL);
// Two-uop multiplies: both uops must be scheduled in the same cycle.
static MOP_IMUL_R: ScMacroOp = ScMacroOp::new("IMUL_R", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
static MOP_MUL_R: ScMacroOp = ScMacroOp::new("MUL_R", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
// MOV_RR issues no uops, so the scheduler treats it as eliminated.
static MOP_MOV_RR: ScMacroOp =
    ScMacroOp::new("MOV_RR", 3, 1, ExecutionPort::NULL, ExecutionPort::NULL);

static MOP_LEA_SIB: ScMacroOp =
    ScMacroOp::new("LEA_SIB", 4, 1, ExecutionPort::P01, ExecutionPort::NULL);
// Dependent variant: waits for the preceding macro-op (the constant load).
static MOP_IMUL_RR_DEP: ScMacroOp =
    ScMacroOp::new_dep("IMUL_RR_DEP", 4, 3, ExecutionPort::P1, ExecutionPort::NULL);
static MOP_ROR_RI: ScMacroOp =
    ScMacroOp::new("ROR_RI", 4, 1, ExecutionPort::P05, ExecutionPort::NULL);

static MOP_ADD_RI: ScMacroOp =
    ScMacroOp::new("ADD_RI", 7, 1, ExecutionPort::P015, ExecutionPort::NULL);
static MOP_XOR_RI: ScMacroOp =
    ScMacroOp::new("XOR_RI", 7, 1, ExecutionPort::P015, ExecutionPort::NULL);

// 10-byte move of a 64-bit immediate (used by IMUL_RCP's reciprocal load).
static MOP_MOV_RI64: ScMacroOp =
    ScMacroOp::new("MOV_RI64", 10, 1, ExecutionPort::P015, ExecutionPort::NULL);

static MOP_IMUL_RR: ScMacroOp =
    ScMacroOp::new("IMUL_RR", 4, 3, ExecutionPort::P1, ExecutionPort::NULL);
446
/// Static description of a superscalar instruction: its macro-op sequence
/// and which macro-ops carry operand selection and the result.
#[allow(nonstandard_style)]
#[derive(Debug)]
pub struct ScInstrInfo {
    pub op: ScOpcode,
    // Macro-ops this instruction decodes into, in issue order.
    pub macro_ops: &'static [&'static ScMacroOp],
    // Index of the macro-op whose completion produces the result.
    pub result_op: usize,
    // Macro-op index at which the source / destination register is chosen;
    // -1 means the instruction has no such operand.
    pub src_op: i32,
    pub dst_op: i32,
}
456
457impl ScInstrInfo {
458 pub const fn new(
459 op: ScOpcode,
460 macro_ops: &'static [&ScMacroOp],
461 result_op: usize,
462 dst_op: i32,
463 src_op: i32,
464 ) -> ScInstrInfo {
465 ScInstrInfo {
466 op,
467 macro_ops,
468 result_op,
469 src_op,
470 dst_op,
471 }
472 }
473
474 pub fn size(&self) -> usize {
475 self.macro_ops.len()
476 }
477
478 pub fn macro_op(&self, i: usize) -> &'static ScMacroOp {
479 self.macro_ops[i]
480 }
481}
482
// Descriptor for the null instruction: no macro-ops, never executed.
static NOP: ScInstrInfo = ScInstrInfo::new(ScOpcode::INVALID, &[], 0, 0, 0);

// Single-macro-op instructions: result, dst, and src are all on macro-op 0.
// A src_op of -1 means the instruction takes no source register.
static ISUB_R: ScInstrInfo = ScInstrInfo::new(ScOpcode::ISUB_R, &[&MOP_SUB_RR], 0, 0, 0);
static IXOR_R: ScInstrInfo = ScInstrInfo::new(ScOpcode::IXOR_R, &[&MOP_XOR_RR], 0, 0, 0);
static IADD_RS: ScInstrInfo = ScInstrInfo::new(ScOpcode::IADD_RS, &[&MOP_LEA_SIB], 0, 0, 0);
static IMUL_R: ScInstrInfo = ScInstrInfo::new(ScOpcode::IMUL_R, &[&MOP_IMUL_RR], 0, 0, 0);
static IROR_C: ScInstrInfo = ScInstrInfo::new(ScOpcode::IROR_C, &[&MOP_ROR_RI], 0, 0, -1);

// Immediate add/xor variants; they differ only in decode-slot width.
static IADD_C7: ScInstrInfo = ScInstrInfo::new(ScOpcode::IADD_C7, &[&MOP_ADD_RI], 0, 0, -1);
static IXOR_C7: ScInstrInfo = ScInstrInfo::new(ScOpcode::IXOR_C7, &[&MOP_XOR_RI], 0, 0, -1);
static IADD_C8: ScInstrInfo = ScInstrInfo::new(ScOpcode::IADD_C8, &[&MOP_ADD_RI], 0, 0, -1);
static IXOR_C8: ScInstrInfo = ScInstrInfo::new(ScOpcode::IXOR_C8, &[&MOP_XOR_RI], 0, 0, -1);
static IADD_C9: ScInstrInfo = ScInstrInfo::new(ScOpcode::IADD_C9, &[&MOP_ADD_RI], 0, 0, -1);
static IXOR_C9: ScInstrInfo = ScInstrInfo::new(ScOpcode::IXOR_C9, &[&MOP_XOR_RI], 0, 0, -1);

// Wide multiplies: mov / mul / mov, result produced by the middle macro-op,
// destination chosen at op 0 and source at op 1.
static IMULH_R: ScInstrInfo = ScInstrInfo::new(
    ScOpcode::IMULH_R,
    &[&MOP_MOV_RR, &MOP_MUL_R, &MOP_MOV_RR],
    1,
    0,
    1,
);
static ISMULH_R: ScInstrInfo = ScInstrInfo::new(
    ScOpcode::ISMULH_R,
    &[&MOP_MOV_RR, &MOP_IMUL_R, &MOP_MOV_RR],
    1,
    0,
    1,
);
// IMUL_RCP: load the 64-bit reciprocal, then a dependent multiply.
static IMUL_RCP: ScInstrInfo = ScInstrInfo::new(
    ScOpcode::IMUL_RCP,
    &[&MOP_MOV_RI64, &MOP_IMUL_RR_DEP],
    1,
    1,
    -1,
);
519
// Size of the Blake2b output block used as the entropy pool.
const BLAKE_GEN_DATA_LEN: usize = 64;
/// Deterministic byte stream backed by repeated Blake2b hashing of a
/// seed+nonce state; supplies all randomness for program generation.
pub struct Blake2Generator {
    // Read position within `data`; when it would overrun, `data` is re-hashed.
    index: usize,
    data: [u8; BLAKE_GEN_DATA_LEN],
    // Blake2b parameters (64-byte output), reused for every re-hash.
    gen_params: Params,
}
526
527impl Blake2Generator {
528 pub fn new(seed: &[u8], nonce: u32) -> Blake2Generator {
529 debug_assert!(seed.len() <= BLAKE_GEN_DATA_LEN - 4);
530 let mut params = Params::new();
531 params.hash_length(BLAKE_GEN_DATA_LEN);
532
533 let mut key: [u8; 60] = [0; 60];
534 key[..seed.len()].copy_from_slice(seed);
535
536 let mut data: [u8; BLAKE_GEN_DATA_LEN] = [0; BLAKE_GEN_DATA_LEN];
537 data[..BLAKE_GEN_DATA_LEN - 4].copy_from_slice(&key);
538 data[BLAKE_GEN_DATA_LEN - 4..BLAKE_GEN_DATA_LEN].copy_from_slice(&nonce.to_le_bytes());
539
540 Blake2Generator {
541 index: BLAKE_GEN_DATA_LEN,
542 data,
543 gen_params: params,
544 }
545 }
546
547 pub fn get_byte(&mut self) -> u8 {
548 self.check_data(1);
549 let v = self.data[self.index];
550 self.index += 1;
551 v
552 }
553
554 pub fn get_u32(&mut self) -> u32 {
555 self.check_data(4);
556 let v = u32::from_le_bytes(self.data[self.index..(self.index + 4)].try_into().unwrap());
557 self.index += 4;
558 v
559 }
560 fn check_data(&mut self, needed: usize) {
561 if self.index + needed > BLAKE_GEN_DATA_LEN {
562 let out = self.gen_params.hash(&self.data);
563 self.data = *out.as_array();
564 self.index = 0;
565 }
566 }
567}
568
/// One decoder configuration: a fixed split of the 16-byte fetch window
/// into instruction slots.
pub struct DecoderBuffer {
    // Position of this configuration in the buffer tables; passed to
    // instruction creation as `fetch_type`.
    index: u32,
    // Byte width of each decode slot (widths sum to 16).
    counts: &'static [u32],
}
573
// Decoder configurations; each slot-width list sums to the 16-byte fetch
// window. Names encode the slot widths.
static BUFFER_484: DecoderBuffer = DecoderBuffer {
    index: 0,
    counts: &[4, 8, 4],
};
static BUFFER_7333: DecoderBuffer = DecoderBuffer {
    index: 1,
    counts: &[7, 3, 3, 3],
};
static BUFFER_3733: DecoderBuffer = DecoderBuffer {
    index: 2,
    counts: &[3, 7, 3, 3],
};
static BUFFER_493: DecoderBuffer = DecoderBuffer {
    index: 3,
    counts: &[4, 9, 3],
};
// Index 4 is special-cased in `create_for_slot` to force IMUL_R.
static BUFFER_4444: DecoderBuffer = DecoderBuffer {
    index: 4,
    counts: &[4, 4, 4, 4],
};
// NOTE(review): "BUFFFER" (triple F) is a typo; kept as-is because the name
// is referenced by `fetch_next`.
static BUFFFER_3310: DecoderBuffer = DecoderBuffer {
    index: 5,
    counts: &[3, 3, 10],
};

// The four configurations eligible for random selection in `fetch_next`.
static DECODE_BUFFERS: [&DecoderBuffer; 4] = [&BUFFER_484, &BUFFER_7333, &BUFFER_3733, &BUFFER_493];
600
601impl DecoderBuffer {
602 fn initial() -> DecoderBuffer {
603 DecoderBuffer {
604 index: 0,
605 counts: &[],
606 }
607 }
608
609 pub fn size(&self) -> usize {
610 self.counts.len()
611 }
612
613 pub fn fetch_next(
614 &self,
615 instr: &ScInstr,
616 decode_cycle: usize,
617 mul_count: usize,
618 generator: &mut Blake2Generator,
619 ) -> &'static DecoderBuffer {
620 if instr.info.op == ScOpcode::IMULH_R || instr.info.op == ScOpcode::ISMULH_R {
621 return &BUFFFER_3310;
622 }
623 if mul_count < decode_cycle + 1 {
624 return &BUFFER_4444;
625 }
626 if instr.info.op == ScOpcode::IMUL_RCP {
627 return if generator.get_byte() & 0x1 == 1 {
628 &BUFFER_484
629 } else {
630 &BUFFER_493
631 };
632 }
633 let ix = generator.get_byte();
634 DECODE_BUFFERS[(ix & 3) as usize]
635 }
636}
637
/// A generated superscalar program together with the statistics collected
/// while scheduling it.
pub struct ScProgram<'a> {
    pub prog: Vec<ScInstr<'a>>,
    // Per-register dependency-chain depth estimated for an ASIC.
    pub asic_latencies: Vec<usize>,
    // Per-register availability cycles from the simulated CPU schedule.
    pub cpu_latencies: Vec<usize>,
    // Register with the deepest ASIC dependency chain (see `generate`).
    pub address_reg: usize,
    // Macro-ops retired per cycle.
    pub ipc: f64,
    // Total encoded size of all macro-ops, in bytes.
    pub code_size: usize,
    // Total number of macro-ops scheduled.
    pub macro_ops: usize,
    // Number of decode cycles consumed.
    pub decode_cycles: usize,
    // Cycle at which the last result retired.
    pub cpu_latency: usize,
    // Maximum of `asic_latencies`.
    pub asic_latency: usize,
    // Number of multiplication instructions in the program.
    pub mul_count: usize,
}
651
652impl fmt::Display for ScProgram<'_> {
653 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
654 for instr in &self.prog {
655 writeln!(
656 f,
657 "op: {}, src: {}, dst: {}",
658 instr.info.op, instr.src, instr.dst
659 )
660 .unwrap();
661 }
662 Ok(())
663 }
664}
665
666impl ScProgram<'_> {
667 pub fn generate(generator: &mut Blake2Generator) -> ScProgram<'static> {
668 let mut prog = Vec::with_capacity(SUPERSCALAR_MAX_SIZE);
669
670 let mut port_busy = [[ExecutionPort::NULL; 3]; CYCLE_MAP_SIZE];
671 let mut registers = [RegisterInfo::new(); 8];
672
673 let mut macro_op_index = 0;
674 let mut code_size = 0;
675 let mut macro_op_count = 0;
676 let mut cycle = 0;
677 let mut dep_cycle = 0;
678 let mut retire_cycle = 0;
679 let mut ports_saturated = false;
680 let mut program_size = 0;
681 let mut mul_count = 0;
682 let mut decode_cycle = 0;
683 let mut throw_away_count = 0;
684
685 let mut decode_buffer = &DecoderBuffer::initial();
686 let mut current_instr = ScInstr::null();
687 while decode_cycle < RANDOMX_SUPERSCALAR_LATENCY
688 && !ports_saturated
689 && program_size < SUPERSCALAR_MAX_SIZE
690 {
691 decode_buffer = decode_buffer.fetch_next(¤t_instr, decode_cycle, mul_count, generator);
692 let mut buffer_index = 0;
693 while buffer_index < decode_buffer.size() {
694 let top_cycle = cycle;
695 if macro_op_index >= current_instr.info.size() {
696 if ports_saturated || program_size >= SUPERSCALAR_MAX_SIZE {
697 break;
698 }
699
700 current_instr = ScInstr::create_for_slot(
701 generator,
702 decode_buffer.counts[buffer_index],
703 decode_buffer.index,
704 decode_buffer.size() == buffer_index + 1,
705 );
706 macro_op_index = 0
707 }
708
709 let mop = current_instr.info.macro_op(macro_op_index);
710 let schedule_cycle_mop = schedule_mop(false, mop, &mut port_busy, cycle, dep_cycle);
711 if schedule_cycle_mop.is_none() {
712 ports_saturated = true;
713 break;
714 }
715
716 let mut schedule_cycle = schedule_cycle_mop.unwrap();
717 if macro_op_index as i32 == current_instr.info.src_op {
718 let mut forward = 0;
719 while forward < LOOK_FORWARD_CYCLES
720 && !current_instr.select_source(schedule_cycle, ®isters, generator)
721 {
722 schedule_cycle += 1;
723 cycle += 1;
724 forward += 1;
725 }
726
727 if forward == LOOK_FORWARD_CYCLES {
728 if throw_away_count < MAX_THROWAWAY_COUNT {
729 throw_away_count += 1;
730 macro_op_index = current_instr.info.size();
731 continue;
732 }
733 current_instr = ScInstr::null();
734 break;
735 }
736 }
737 if macro_op_index as i32 == current_instr.info.dst_op {
738 let mut forward = 0;
739 while forward < LOOK_FORWARD_CYCLES
740 && !current_instr.select_destination(
741 schedule_cycle,
742 throw_away_count > 0,
743 ®isters,
744 generator,
745 ) {
746 schedule_cycle += 1;
747 cycle += 1;
748 forward += 1;
749 }
750 if forward == LOOK_FORWARD_CYCLES {
751 if throw_away_count < MAX_THROWAWAY_COUNT {
752 throw_away_count += 1;
753 macro_op_index = current_instr.info.size();
754 continue;
755 }
756 current_instr = ScInstr::null();
757 break;
758 }
759 }
760 throw_away_count = 0;
761
762 let schedule_cycle_mop =
763 schedule_mop(true, mop, &mut port_busy, schedule_cycle, schedule_cycle);
764 if schedule_cycle_mop.is_none() {
765 ports_saturated = true;
766 break;
767 }
768 schedule_cycle = schedule_cycle_mop.unwrap();
769 dep_cycle = schedule_cycle + mop.latency;
770
771 if macro_op_index == current_instr.info.result_op {
772 let ri = &mut registers[current_instr.dst as usize];
773 retire_cycle = dep_cycle;
774 ri.latency = retire_cycle;
775 ri.last_op_group = current_instr.op_group;
776 ri.last_op_par = current_instr.op_group_par;
777 }
778 code_size += mop.size;
779 buffer_index += 1;
780 macro_op_index += 1;
781 macro_op_count += 1;
782
783 if schedule_cycle >= RANDOMX_SUPERSCALAR_LATENCY {
784 ports_saturated = true;
785 }
786 cycle = top_cycle;
787
788 if macro_op_index >= current_instr.info.size() {
789 if current_instr.info.op.is_multiplication() {
790 mul_count += 1;
791 }
792 prog.push(current_instr);
793 program_size += 1;
794 }
795 }
796 cycle += 1;
797 decode_cycle += 1;
798 }
799
800 let ipc = macro_op_count as f64 / retire_cycle as f64;
801 let mut asic_latencies = vec![0; 8];
802 for &instr in prog.iter().take(program_size) {
803 let lat_dst = asic_latencies[instr.dst as usize] + 1;
804 let lat_src = if instr.src < 0 || instr.src == instr.dst {
805 0
806 } else {
807 asic_latencies[instr.src as usize] + 1
808 };
809 asic_latencies[instr.dst as usize] = lat_dst.max(lat_src);
810 }
811
812 let mut asic_latency_max = 0;
813 let mut address_reg = 0;
814 let mut cpu_latencies = vec![0; 8];
815 for i in 0..8 {
816 if asic_latencies[i] > asic_latency_max {
817 asic_latency_max = asic_latencies[i];
818 address_reg = i;
819 }
820 cpu_latencies[i] = registers[i].latency;
821 }
822
823 ScProgram {
824 prog,
825 asic_latencies,
826 cpu_latencies,
827 address_reg,
828 ipc,
829 mul_count,
830 cpu_latency: retire_cycle,
831 asic_latency: asic_latency_max,
832 code_size,
833 macro_ops: macro_op_count,
834 decode_cycles: decode_cycle,
835 }
836 }
837
838 pub fn execute(&self, ds: &mut [u64; 8]) {
839 for instr in &self.prog {
840 let dst = instr.dst as usize;
841 let src = instr.src as usize;
842 match instr.info.op {
843 ScOpcode::ISUB_R => ds[dst] = ds[dst].wrapping_sub(ds[src]),
844 ScOpcode::IXOR_R => ds[dst] ^= ds[src],
845 ScOpcode::IADD_RS => ds[dst] = ds[dst].wrapping_add(ds[src] << instr.mod_shift()),
846 ScOpcode::IMUL_R => {
847 ds[dst] = ds[dst].wrapping_mul(ds[src]);
848 }
849 ScOpcode::IROR_C => ds[dst] = ds[dst].rotate_right(instr.imm32),
850 ScOpcode::IADD_C7 | ScOpcode::IADD_C8 | ScOpcode::IADD_C9 => {
851 ds[dst] = ds[dst].wrapping_add(u64_from_u32_imm(instr.imm32));
852 }
853 ScOpcode::IXOR_C7 | ScOpcode::IXOR_C8 | ScOpcode::IXOR_C9 => {
854 ds[dst] ^= u64_from_u32_imm(instr.imm32);
855 }
856 ScOpcode::IMULH_R => ds[dst] = mulh(ds[dst], ds[src]),
857 ScOpcode::ISMULH_R => ds[dst] = smulh(ds[dst], ds[src]),
858 ScOpcode::IMUL_RCP => {
859 ds[dst] = ds[dst].wrapping_mul(randomx_reciprocal(instr.imm32 as u64))
860 }
861 ScOpcode::COUNT => panic!("COUNT execution tried"),
862 ScOpcode::INVALID => panic!("INVALLID execution tried"),
863 }
864 }
865 }
866}
867
868#[allow(clippy::unnecessary_unwrap)]
869fn schedule_mop(
870 commit: bool,
871 mop: &ScMacroOp,
872 port_busy: &mut [[ExecutionPort; 3]; CYCLE_MAP_SIZE],
873 cycle_in: usize,
874 dep_cycle: usize,
875) -> Option<usize> {
876 let mut cycle = if mop.dependent {
877 usize::max(cycle_in, dep_cycle)
878 } else {
879 cycle_in
880 };
881
882 if mop.is_eliminated() {
883 return Some(cycle);
884 } else if mop.is_simple() {
885 return schedule_uop(commit, mop.uop1, port_busy, cycle);
886 } else {
887 while cycle < CYCLE_MAP_SIZE {
888 let cycle_1 = schedule_uop(false, mop.uop1, port_busy, cycle);
889 let cycle_2 = schedule_uop(false, mop.uop2, port_busy, cycle);
890
891 if cycle_1.is_some() && cycle_1 == cycle_2 {
892 if commit {
893 schedule_uop(true, mop.uop1, port_busy, cycle_1.unwrap());
894 schedule_uop(true, mop.uop2, port_busy, cycle_2.unwrap());
895 }
896 return cycle_1;
897 }
898 cycle += 1
899 }
900 }
901 None
902}
903
904fn schedule_uop(
905 commit: bool,
906 uop: ExecutionPort,
907 port_busy: &mut [[ExecutionPort; 3]; CYCLE_MAP_SIZE],
908 cycle_in: usize,
909) -> Option<usize> {
910 let mut cycle = cycle_in;
911 while cycle < CYCLE_MAP_SIZE {
912 if uop.is(ExecutionPort::P5) && port_busy[cycle][2] == ExecutionPort::NULL {
913 if commit {
914 port_busy[cycle][2] = uop;
915 }
916 return Some(cycle);
917 }
918 if uop.is(ExecutionPort::P0) && port_busy[cycle][0] == ExecutionPort::NULL {
919 if commit {
920 port_busy[cycle][0] = uop;
921 }
922 return Some(cycle);
923 }
924 if uop.is(ExecutionPort::P1) && port_busy[cycle][1] == ExecutionPort::NULL {
925 if commit {
926 port_busy[cycle][1] = uop;
927 }
928 return Some(cycle);
929 }
930 cycle += 1
931 }
932 None
933}