//! SuperscalarHash program generation (RandomX), `superscalar.rs`.

1extern crate blake2b_simd;
2
3use self::blake2b_simd::Params;
4use std::convert::TryInto;
5use std::fmt;
6use strum::Display;
7
8use super::common::{mulh, randomx_reciprocal, smulh, u64_from_u32_imm};
9use super::program::REG_NEEDS_DISPLACEMENT_IX;
10
/// Latency budget (in simulated decode cycles) for one superscalar program.
const RANDOMX_SUPERSCALAR_LATENCY: usize = 170;
/// Length of the per-cycle port-busy map; 4 extra cycles of slack past the budget.
const CYCLE_MAP_SIZE: usize = RANDOMX_SUPERSCALAR_LATENCY + 4;
/// Hard cap on the number of instructions in a generated program.
const SUPERSCALAR_MAX_SIZE: usize = 3 * RANDOMX_SUPERSCALAR_LATENCY + 2;
/// How many cycles ahead the scheduler looks for a ready register.
const LOOK_FORWARD_CYCLES: usize = 4;
/// Maximum consecutive discarded instructions before generation gives up.
const MAX_THROWAWAY_COUNT: usize = 256;
16
17#[allow(nonstandard_style)]
18#[derive(Copy, Clone, Display, Debug, PartialEq)]
19pub enum ScOpcode {
20	INVALID = -1,
21	ISUB_R = 0,
22	IXOR_R = 1,
23	IADD_RS = 2,
24	IMUL_R = 3,
25	IROR_C = 4,
26	IADD_C7 = 5,
27	IXOR_C7 = 6,
28	IADD_C8 = 7,
29	IXOR_C8 = 8,
30	IADD_C9 = 9,
31	IXOR_C9 = 10,
32	IMULH_R = 11,
33	ISMULH_R = 12,
34	IMUL_RCP = 13,
35	COUNT = 14,
36}
37
38impl ScOpcode {
39	fn is_multiplication(self) -> bool {
40		self == ScOpcode::IMUL_R
41			|| self == ScOpcode::IMULH_R
42			|| self == ScOpcode::ISMULH_R
43			|| self == ScOpcode::IMUL_RCP
44	}
45}
46
/// Per-register scheduling state tracked while generating a program.
#[derive(Copy, Clone)]
struct RegisterInfo {
	/// Operation group of the last instruction that wrote this register
	/// (used to reject trivially reducible instruction chains).
	pub last_op_group: ScOpcode,
	/// Cycle at which the register's current value becomes available.
	pub latency: usize,
	/// Group parameter of the last write; -1 when not applicable.
	pub last_op_par: i32,
}
53
54impl RegisterInfo {
55	fn new() -> RegisterInfo {
56		RegisterInfo {
57			latency: 0,
58			last_op_group: ScOpcode::INVALID,
59			last_op_par: -1,
60		}
61	}
62}
63
/// One superscalar instruction plus the metadata used while scheduling it.
#[derive(Copy, Clone, Debug)]
pub struct ScInstr<'a> {
	/// Static descriptor (opcode and macro-op decomposition).
	pub info: &'a ScInstrInfo,
	/// Destination register index, or -1 while unassigned.
	pub dst: i32,
	/// Source register index, or -1 while unassigned / not used.
	pub src: i32,
	/// Raw mod byte; the IADD_RS shift amount lives in bits 2..=3 (see `mod_shift`).
	pub mod_v: u8,
	/// 32-bit immediate (rotation count, constant, or reciprocal input).
	pub imm32: u32,
	/// Group used for dependency-chain restrictions during register selection.
	pub op_group: ScOpcode,
	/// Group parameter; mirrors `src` when `group_par_is_source` is set.
	pub op_group_par: i32,
	/// Whether the destination may equal the source register.
	pub can_reuse: bool,
	/// When true, `op_group_par` is overwritten with the selected source.
	pub group_par_is_source: bool,
}
76
77impl ScInstr<'_> {
78	fn null() -> ScInstr<'static> {
79		ScInstr {
80			info: &NOP,
81			dst: -1,
82			src: -1,
83			mod_v: 0,
84			imm32: 0,
85			op_group: ScOpcode::INVALID,
86			can_reuse: false,
87			group_par_is_source: false,
88			op_group_par: -1,
89		}
90	}
91
92	pub fn mod_shift(&self) -> u64 {
93		((self.mod_v >> 2) % 4) as u64
94	}
95
96	fn select_destination(
97		&mut self,
98		cycle: usize,
99		allow_chain_mul: bool,
100		registers: &[RegisterInfo; 8],
101		generator: &mut Blake2Generator,
102	) -> bool {
103		let mut available_registers = Vec::with_capacity(8);
104		for (i, v) in registers.iter().enumerate() {
105			if v.latency <= cycle
106				&& (self.can_reuse || i as i32 != self.src)
107				&& (allow_chain_mul
108					|| self.op_group != ScOpcode::IMUL_R
109					|| v.last_op_group != ScOpcode::IMUL_R)
110				&& (v.last_op_group != self.op_group || v.last_op_par != self.op_group_par)
111				&& (self.info.op != ScOpcode::IADD_RS || i != REG_NEEDS_DISPLACEMENT_IX)
112			{
113				available_registers.push(i);
114			}
115		}
116		self.select_register(&available_registers, generator, false)
117	}
118
119	fn select_source(
120		&mut self,
121		cycle: usize,
122		registers: &[RegisterInfo; 8],
123		generator: &mut Blake2Generator,
124	) -> bool {
125		let mut available_registers = Vec::with_capacity(8);
126
127		for (i, v) in registers.iter().enumerate() {
128			if v.latency <= cycle {
129				available_registers.push(i);
130			}
131		}
132
133		if available_registers.len() == 2
134			&& self.info.op == ScOpcode::IADD_RS
135			&& (available_registers[0] == REG_NEEDS_DISPLACEMENT_IX
136				|| available_registers[1] == REG_NEEDS_DISPLACEMENT_IX)
137		{
138			self.op_group_par = REG_NEEDS_DISPLACEMENT_IX as i32;
139			self.src = REG_NEEDS_DISPLACEMENT_IX as i32;
140			return true;
141		}
142
143		if self.select_register(&available_registers, generator, true) {
144			if self.group_par_is_source {
145				self.op_group_par = self.src;
146			}
147			return true;
148		}
149		false
150	}
151
152	fn select_register(
153		&mut self,
154		available_registers: &[usize],
155		generator: &mut Blake2Generator,
156		reg_src: bool,
157	) -> bool {
158		if available_registers.is_empty() {
159			return false;
160		}
161		let index = if available_registers.len() > 1 {
162			generator.get_u32() as usize % available_registers.len()
163		} else {
164			0
165		};
166
167		if reg_src {
168			self.src = available_registers[index] as i32;
169		} else {
170			self.dst = available_registers[index] as i32;
171		}
172		true
173	}
174}
175
// Candidate instruction tables per decode-slot size (in bytes). SLOT_3L's
// upper half (the wide multiplies) is only reachable when the slot is the
// last one of its decode group (see `create_for_slot`).
static SLOT_3L: [&ScInstrInfo; 4] = [&ISUB_R, &IXOR_R, &IMULH_R, &ISMULH_R];
static SLOT_4: [&ScInstrInfo; 2] = [&IROR_C, &IADD_RS];
static SLOT_7: [&ScInstrInfo; 2] = [&IXOR_C7, &IADD_C7];
static SLOT_8: [&ScInstrInfo; 2] = [&IXOR_C8, &IADD_C8];
static SLOT_9: [&ScInstrInfo; 2] = [&IXOR_C9, &IADD_C9];
// The 10-byte slot has exactly one candidate.
static SLOT_10: &ScInstrInfo = &IMUL_RCP;
182
/// True for 0 and for every power of two — i.e. at most one bit is set.
/// (Such values make unusable IMUL_RCP multipliers.)
fn is_zero_or_power_of_2(v: u32) -> bool {
	v.count_ones() <= 1
}
186
187impl ScInstr<'_> {
188	pub fn create_for_slot<'a>(
189		generator: &mut Blake2Generator,
190		slot_size: u32,
191		fetch_type: u32,
192		is_last: bool,
193	) -> ScInstr<'a> {
194		match slot_size {
195			3 => {
196				if is_last {
197					ScInstr::create(SLOT_3L[(generator.get_byte() & 3) as usize], generator)
198				} else {
199					ScInstr::create(SLOT_3L[(generator.get_byte() & 1) as usize], generator)
200				}
201			}
202			4 => {
203				if fetch_type == 4 && !is_last {
204					ScInstr::create(&IMUL_R, generator)
205				} else {
206					ScInstr::create(SLOT_4[(generator.get_byte() & 1) as usize], generator)
207				}
208			}
209			7 => ScInstr::create(SLOT_7[(generator.get_byte() & 1) as usize], generator),
210			8 => ScInstr::create(SLOT_8[(generator.get_byte() & 1) as usize], generator),
211			9 => ScInstr::create(SLOT_9[(generator.get_byte() & 1) as usize], generator),
212			10 => ScInstr::create(SLOT_10, generator),
213			_ => panic!("illegal slot_size {}", slot_size),
214		}
215	}
216
217	fn create<'a>(info: &'static ScInstrInfo, generator: &mut Blake2Generator) -> ScInstr<'a> {
218		match info.op {
219			ScOpcode::ISUB_R => ScInstr {
220				info,
221				dst: -1,
222				src: -1,
223				mod_v: 0,
224				imm32: 0,
225				op_group: ScOpcode::IADD_RS,
226				can_reuse: false,
227				group_par_is_source: true,
228				op_group_par: 0,
229			},
230			ScOpcode::IXOR_R => ScInstr {
231				info,
232				dst: -1,
233				src: -1,
234				mod_v: 0,
235				imm32: 0,
236				op_group: ScOpcode::IXOR_R,
237				can_reuse: false,
238				group_par_is_source: true,
239				op_group_par: 0,
240			},
241			ScOpcode::IADD_RS => ScInstr {
242				info,
243				dst: -1,
244				src: -1,
245				mod_v: generator.get_byte(),
246				imm32: 0,
247				op_group: ScOpcode::IADD_RS,
248				can_reuse: false,
249				group_par_is_source: true,
250				op_group_par: 0,
251			},
252			ScOpcode::IMUL_R => ScInstr {
253				info,
254				dst: -1,
255				src: -1,
256				mod_v: 0,
257				imm32: 0,
258				op_group: ScOpcode::IMUL_R,
259				can_reuse: false,
260				group_par_is_source: true,
261				op_group_par: 0,
262			},
263			ScOpcode::IROR_C => {
264				let mut imm32;
265				while {
266					imm32 = generator.get_byte() & 63;
267					imm32 == 0
268				} {}
269				ScInstr {
270					info,
271					dst: -1,
272					src: -1,
273					mod_v: 0,
274					imm32: imm32 as u32,
275					op_group: ScOpcode::IROR_C,
276					can_reuse: false,
277					group_par_is_source: true,
278					op_group_par: 0,
279				}
280			}
281			ScOpcode::IADD_C7 | ScOpcode::IADD_C8 | ScOpcode::IADD_C9 => ScInstr {
282				info,
283				dst: -1,
284				src: -1,
285				mod_v: 0,
286				imm32: generator.get_u32(),
287				op_group: ScOpcode::IADD_C7,
288				can_reuse: false,
289				group_par_is_source: false,
290				op_group_par: -1,
291			},
292			ScOpcode::IXOR_C7 | ScOpcode::IXOR_C8 | ScOpcode::IXOR_C9 => ScInstr {
293				info,
294				dst: -1,
295				src: -1,
296				mod_v: 0,
297				imm32: generator.get_u32(),
298				op_group: ScOpcode::IXOR_C7,
299				can_reuse: false,
300				group_par_is_source: false,
301				op_group_par: -1,
302			},
303			ScOpcode::IMULH_R => ScInstr {
304				info,
305				dst: -1,
306				src: -1,
307				mod_v: 0,
308				imm32: 0,
309				op_group: ScOpcode::IMULH_R,
310				group_par_is_source: true,
311				can_reuse: false,
312				op_group_par: generator.get_u32() as i32,
313			},
314			ScOpcode::ISMULH_R => ScInstr {
315				info,
316				dst: -1,
317				src: -1,
318				mod_v: 0,
319				imm32: 0,
320				op_group: ScOpcode::ISMULH_R,
321				group_par_is_source: true,
322				can_reuse: false,
323				op_group_par: generator.get_u32() as i32,
324			},
325			ScOpcode::IMUL_RCP => {
326				let mut imm32;
327				while {
328					imm32 = generator.get_u32();
329					is_zero_or_power_of_2(imm32)
330				} {}
331				ScInstr {
332					info,
333					dst: -1,
334					src: -1,
335					mod_v: 0,
336					imm32,
337					op_group: ScOpcode::IMUL_RCP,
338					can_reuse: false,
339					group_par_is_source: true,
340					op_group_par: -1,
341				}
342			}
343			ScOpcode::INVALID | ScOpcode::COUNT => panic!("invalid opcode {} here", info.op),
344		}
345	}
346}
347
/// Execution-port bit masks for the simulated CPU: three single ports plus
/// their unions, so a uop can declare every port it may issue on.
#[derive(Copy, Clone, PartialEq, Debug)]
#[repr(u8)]
pub enum ExecutionPort {
	NULL = 0,
	P0 = 1,
	P1 = 2,
	P5 = 4,
	P01 = ExecutionPort::P0 as u8 | ExecutionPort::P1 as u8,
	P05 = ExecutionPort::P0 as u8 | ExecutionPort::P5 as u8,
	P015 = ExecutionPort::P0 as u8 | ExecutionPort::P1 as u8 | ExecutionPort::P5 as u8,
}

impl ExecutionPort {
	/// True when this mask shares at least one port bit with `check`.
	fn is(self, check: ExecutionPort) -> bool {
		self as u8 & check as u8 != 0
	}
}
365
/// One x86 macro-op of a superscalar instruction, with the data the
/// scheduler needs: encoded size, latency, and the port masks of its uops.
#[derive(Debug)]
pub struct ScMacroOp {
	// Mnemonic; kept for debugging only.
	#[allow(dead_code)]
	name: &'static str,
	/// Encoded size in bytes (matches the decode-slot sizes).
	size: usize,
	/// Result latency in cycles.
	latency: usize,
	/// Port mask of the first uop; NULL means the op is eliminated.
	uop1: ExecutionPort,
	/// Port mask of the second uop; NULL means a single-uop op.
	uop2: ExecutionPort,
	/// When true, may not issue before the previous macro-op's result is ready.
	dependent: bool,
}
376
377impl ScMacroOp {
378	pub const fn new(
379		name: &'static str,
380		size: usize,
381		latency: usize,
382		uop1: ExecutionPort,
383		uop2: ExecutionPort,
384	) -> ScMacroOp {
385		ScMacroOp {
386			name,
387			size,
388			latency,
389			uop1,
390			uop2,
391			dependent: false,
392		}
393	}
394	pub const fn new_dep(
395		name: &'static str,
396		size: usize,
397		latency: usize,
398		uop1: ExecutionPort,
399		uop2: ExecutionPort,
400	) -> ScMacroOp {
401		ScMacroOp {
402			name,
403			size,
404			latency,
405			uop1,
406			uop2,
407			dependent: true,
408		}
409	}
410
411	pub fn is_eliminated(&self) -> bool {
412		self.uop1 == ExecutionPort::NULL
413	}
414
415	pub fn is_simple(&self) -> bool {
416		self.uop2 == ExecutionPort::NULL
417	}
418}
419
// Macro-op descriptors: (mnemonic, encoded size in bytes, latency in cycles,
// first-uop port mask, optional second-uop port mask).
static MOP_SUB_RR: ScMacroOp =
	ScMacroOp::new("SUB_RR", 3, 1, ExecutionPort::P015, ExecutionPort::NULL);
static MOP_XOR_RR: ScMacroOp =
	ScMacroOp::new("XOR_RR", 3, 1, ExecutionPort::P015, ExecutionPort::NULL);
// The wide multiplies issue two uops (P1 + P5) and take 4 cycles.
static MOP_IMUL_R: ScMacroOp = ScMacroOp::new("IMUL_R", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
static MOP_MUL_R: ScMacroOp = ScMacroOp::new("MUL_R", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
// uop1 == NULL: treated as eliminated, never occupies a port.
static MOP_MOV_RR: ScMacroOp =
	ScMacroOp::new("MOV_RR", 3, 1, ExecutionPort::NULL, ExecutionPort::NULL);

static MOP_LEA_SIB: ScMacroOp =
	ScMacroOp::new("LEA_SIB", 4, 1, ExecutionPort::P01, ExecutionPort::NULL);
// Dependent: must wait for the preceding macro-op (the MOV_RI64 of IMUL_RCP).
static MOP_IMUL_RR_DEP: ScMacroOp =
	ScMacroOp::new_dep("IMUL_RR_DEP", 4, 3, ExecutionPort::P1, ExecutionPort::NULL);
static MOP_ROR_RI: ScMacroOp =
	ScMacroOp::new("ROR_RI", 4, 1, ExecutionPort::P05, ExecutionPort::NULL);

static MOP_ADD_RI: ScMacroOp =
	ScMacroOp::new("ADD_RI", 7, 1, ExecutionPort::P015, ExecutionPort::NULL);
static MOP_XOR_RI: ScMacroOp =
	ScMacroOp::new("XOR_RI", 7, 1, ExecutionPort::P015, ExecutionPort::NULL);

static MOP_MOV_RI64: ScMacroOp =
	ScMacroOp::new("MOV_RI64", 10, 1, ExecutionPort::P015, ExecutionPort::NULL);

static MOP_IMUL_RR: ScMacroOp =
	ScMacroOp::new("IMUL_RR", 4, 3, ExecutionPort::P1, ExecutionPort::NULL);
446
/// Static description of a superscalar instruction: its opcode and its
/// decomposition into macro-ops.
#[allow(nonstandard_style)]
#[derive(Debug)]
pub struct ScInstrInfo {
	pub op: ScOpcode,
	/// Macro-ops in issue order.
	pub macro_ops: &'static [&'static ScMacroOp],
	/// Index of the macro-op whose completion produces the result.
	pub result_op: usize,
	/// Macro-op index at which the source register is selected; -1 = none.
	pub src_op: i32,
	/// Macro-op index at which the destination register is selected; -1 = none.
	pub dst_op: i32,
}
456
457impl ScInstrInfo {
458	pub const fn new(
459		op: ScOpcode,
460		macro_ops: &'static [&ScMacroOp],
461		result_op: usize,
462		dst_op: i32,
463		src_op: i32,
464	) -> ScInstrInfo {
465		ScInstrInfo {
466			op,
467			macro_ops,
468			result_op,
469			src_op,
470			dst_op,
471		}
472	}
473
474	pub fn size(&self) -> usize {
475		self.macro_ops.len()
476	}
477
478	pub fn macro_op(&self, i: usize) -> &'static ScMacroOp {
479		self.macro_ops[i]
480	}
481}
482
// Instruction descriptors: ScInstrInfo::new(op, macro_ops, result_op,
// dst_op, src_op). dst_op/src_op are the macro-op indices at which the
// corresponding register gets picked (-1 = no source register).
static NOP: ScInstrInfo = ScInstrInfo::new(ScOpcode::INVALID, &[], 0, 0, 0);

static ISUB_R: ScInstrInfo = ScInstrInfo::new(ScOpcode::ISUB_R, &[&MOP_SUB_RR], 0, 0, 0);
static IXOR_R: ScInstrInfo = ScInstrInfo::new(ScOpcode::IXOR_R, &[&MOP_XOR_RR], 0, 0, 0);
static IADD_RS: ScInstrInfo = ScInstrInfo::new(ScOpcode::IADD_RS, &[&MOP_LEA_SIB], 0, 0, 0);
static IMUL_R: ScInstrInfo = ScInstrInfo::new(ScOpcode::IMUL_R, &[&MOP_IMUL_RR], 0, 0, 0);
static IROR_C: ScInstrInfo = ScInstrInfo::new(ScOpcode::IROR_C, &[&MOP_ROR_RI], 0, 0, -1);

// Immediate-operand instructions have no source register (src_op = -1).
static IADD_C7: ScInstrInfo = ScInstrInfo::new(ScOpcode::IADD_C7, &[&MOP_ADD_RI], 0, 0, -1);
static IXOR_C7: ScInstrInfo = ScInstrInfo::new(ScOpcode::IXOR_C7, &[&MOP_XOR_RI], 0, 0, -1);
static IADD_C8: ScInstrInfo = ScInstrInfo::new(ScOpcode::IADD_C8, &[&MOP_ADD_RI], 0, 0, -1);
static IXOR_C8: ScInstrInfo = ScInstrInfo::new(ScOpcode::IXOR_C8, &[&MOP_XOR_RI], 0, 0, -1);
static IADD_C9: ScInstrInfo = ScInstrInfo::new(ScOpcode::IADD_C9, &[&MOP_ADD_RI], 0, 0, -1);
static IXOR_C9: ScInstrInfo = ScInstrInfo::new(ScOpcode::IXOR_C9, &[&MOP_XOR_RI], 0, 0, -1);

// Wide multiplies: three macro-ops, result produced by the middle one.
static IMULH_R: ScInstrInfo = ScInstrInfo::new(
	ScOpcode::IMULH_R,
	&[&MOP_MOV_RR, &MOP_MUL_R, &MOP_MOV_RR],
	1,
	0,
	1,
);
static ISMULH_R: ScInstrInfo = ScInstrInfo::new(
	ScOpcode::ISMULH_R,
	&[&MOP_MOV_RR, &MOP_IMUL_R, &MOP_MOV_RR],
	1,
	0,
	1,
);
// IMUL_RCP: constant load followed by a dependent multiply.
static IMUL_RCP: ScInstrInfo = ScInstrInfo::new(
	ScOpcode::IMUL_RCP,
	&[&MOP_MOV_RI64, &MOP_IMUL_RR_DEP],
	1,
	1,
	-1,
);
519
/// Number of state bytes kept by the generator (one Blake2b-512 digest).
const BLAKE_GEN_DATA_LEN: usize = 64;
/// Deterministic pseudo-random byte stream: the 64-byte state is re-hashed
/// with Blake2b whenever the unread bytes run out.
pub struct Blake2Generator {
	/// Next unread position in `data`.
	index: usize,
	data: [u8; BLAKE_GEN_DATA_LEN],
	gen_params: Params,
}
526
527impl Blake2Generator {
528	pub fn new(seed: &[u8], nonce: u32) -> Blake2Generator {
529		debug_assert!(seed.len() <= BLAKE_GEN_DATA_LEN - 4);
530		let mut params = Params::new();
531		params.hash_length(BLAKE_GEN_DATA_LEN);
532
533		let mut key: [u8; 60] = [0; 60];
534		key[..seed.len()].copy_from_slice(seed);
535
536		let mut data: [u8; BLAKE_GEN_DATA_LEN] = [0; BLAKE_GEN_DATA_LEN];
537		data[..BLAKE_GEN_DATA_LEN - 4].copy_from_slice(&key);
538		data[BLAKE_GEN_DATA_LEN - 4..BLAKE_GEN_DATA_LEN].copy_from_slice(&nonce.to_le_bytes());
539
540		Blake2Generator {
541			index: BLAKE_GEN_DATA_LEN,
542			data,
543			gen_params: params,
544		}
545	}
546
547	pub fn get_byte(&mut self) -> u8 {
548		self.check_data(1);
549		let v = self.data[self.index];
550		self.index += 1;
551		v
552	}
553
554	pub fn get_u32(&mut self) -> u32 {
555		self.check_data(4);
556		let v = u32::from_le_bytes(self.data[self.index..(self.index + 4)].try_into().unwrap());
557		self.index += 4;
558		v
559	}
560	fn check_data(&mut self, needed: usize) {
561		if self.index + needed > BLAKE_GEN_DATA_LEN {
562			let out = self.gen_params.hash(&self.data);
563			self.data = *out.as_array();
564			self.index = 0;
565		}
566	}
567}
568
/// A decoder configuration: how one 16-byte fetch is split into slots.
pub struct DecoderBuffer {
	/// Identifier of this configuration (passed to `create_for_slot` as `fetch_type`).
	index: u32,
	/// Slot sizes in bytes; each listed buffer's sizes sum to 16.
	counts: &'static [u32],
}
573
// The decode-buffer configurations; names encode the slot split.
static BUFFER_484: DecoderBuffer = DecoderBuffer {
	index: 0,
	counts: &[4, 8, 4],
};
static BUFFER_7333: DecoderBuffer = DecoderBuffer {
	index: 1,
	counts: &[7, 3, 3, 3],
};
static BUFFER_3733: DecoderBuffer = DecoderBuffer {
	index: 2,
	counts: &[3, 7, 3, 3],
};
static BUFFER_493: DecoderBuffer = DecoderBuffer {
	index: 3,
	counts: &[4, 9, 3],
};
// Forced when multiplications are lagging behind (see `fetch_next`).
static BUFFER_4444: DecoderBuffer = DecoderBuffer {
	index: 4,
	counts: &[4, 4, 4, 4],
};
// Follows a wide multiply. NOTE(review): name has a tripled 'F' ("BUFFFER") —
// likely a typo, kept as-is because it is referenced below.
static BUFFFER_3310: DecoderBuffer = DecoderBuffer {
	index: 5,
	counts: &[3, 3, 10],
};

// Buffers selectable at random; 4-4-4-4 and 3-3-10 are only reachable
// through the special cases in `fetch_next`.
static DECODE_BUFFERS: [&DecoderBuffer; 4] = [&BUFFER_484, &BUFFER_7333, &BUFFER_3733, &BUFFER_493];
600
601impl DecoderBuffer {
602	fn initial() -> DecoderBuffer {
603		DecoderBuffer {
604			index: 0,
605			counts: &[],
606		}
607	}
608
609	pub fn size(&self) -> usize {
610		self.counts.len()
611	}
612
613	pub fn fetch_next(
614		&self,
615		instr: &ScInstr,
616		decode_cycle: usize,
617		mul_count: usize,
618		generator: &mut Blake2Generator,
619	) -> &'static DecoderBuffer {
620		if instr.info.op == ScOpcode::IMULH_R || instr.info.op == ScOpcode::ISMULH_R {
621			return &BUFFFER_3310;
622		}
623		if mul_count < decode_cycle + 1 {
624			return &BUFFER_4444;
625		}
626		if instr.info.op == ScOpcode::IMUL_RCP {
627			return if generator.get_byte() & 0x1 == 1 {
628				&BUFFER_484
629			} else {
630				&BUFFER_493
631			};
632		}
633		let ix = generator.get_byte();
634		DECODE_BUFFERS[(ix & 3) as usize]
635	}
636}
637
/// A generated superscalar program plus the statistics collected while
/// simulating its scheduling.
pub struct ScProgram<'a> {
	/// Instructions in program order.
	pub prog: Vec<ScInstr<'a>>,
	/// Per-register dependency-chain depth estimate for an idealised ASIC.
	pub asic_latencies: Vec<usize>,
	/// Per-register availability cycle on the simulated CPU.
	pub cpu_latencies: Vec<usize>,
	/// Register with the longest ASIC chain (used for addressing).
	pub address_reg: usize,
	/// Macro-ops retired per simulated cycle.
	pub ipc: f64,
	/// Total encoded size of all macro-ops, in bytes.
	pub code_size: usize,
	/// Total number of macro-ops issued.
	pub macro_ops: usize,
	/// Number of decode cycles consumed.
	pub decode_cycles: usize,
	/// Cycle at which the last result retires on the simulated CPU.
	pub cpu_latency: usize,
	/// Maximum of `asic_latencies`.
	pub asic_latency: usize,
	/// Number of multiplication instructions in the program.
	pub mul_count: usize,
}
651
652impl fmt::Display for ScProgram<'_> {
653	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
654		for instr in &self.prog {
655			writeln!(
656				f,
657				"op: {}, src: {}, dst: {}",
658				instr.info.op, instr.src, instr.dst
659			)
660			.unwrap();
661		}
662		Ok(())
663	}
664}
665
impl ScProgram<'_> {
	/// Generates a superscalar program from the generator's byte stream by
	/// simulating the decode/issue pipeline of the modelled CPU: each decode
	/// cycle fetches a buffer, fills its slots with instructions, and
	/// schedules every macro-op onto the execution ports. Because the same
	/// byte stream must always yield the same program, the order of
	/// generator reads here is consensus-critical — do not reorder.
	pub fn generate(generator: &mut Blake2Generator) -> ScProgram<'static> {
		let mut prog = Vec::with_capacity(SUPERSCALAR_MAX_SIZE);

		// Per-cycle occupancy of the three execution ports, and per-register state.
		let mut port_busy = [[ExecutionPort::NULL; 3]; CYCLE_MAP_SIZE];
		let mut registers = [RegisterInfo::new(); 8];

		let mut macro_op_index = 0;
		let mut code_size = 0;
		let mut macro_op_count = 0;
		let mut cycle = 0;
		let mut dep_cycle = 0;
		let mut retire_cycle = 0;
		let mut ports_saturated = false;
		let mut program_size = 0;
		let mut mul_count = 0;
		let mut decode_cycle = 0;
		let mut throw_away_count = 0;

		let mut decode_buffer = &DecoderBuffer::initial();
		let mut current_instr = ScInstr::null();
		// One iteration per decode cycle, until the latency budget is spent,
		// the ports saturate, or the program reaches its maximum size.
		while decode_cycle < RANDOMX_SUPERSCALAR_LATENCY
			&& !ports_saturated
			&& program_size < SUPERSCALAR_MAX_SIZE
		{
			decode_buffer = decode_buffer.fetch_next(&current_instr, decode_cycle, mul_count, generator);
			let mut buffer_index = 0;
			// Fill every slot of the fetched decode buffer.
			while buffer_index < decode_buffer.size() {
				let top_cycle = cycle;
				// Previous instruction fully issued: decode a new one for this slot.
				if macro_op_index >= current_instr.info.size() {
					if ports_saturated || program_size >= SUPERSCALAR_MAX_SIZE {
						break;
					}

					current_instr = ScInstr::create_for_slot(
						generator,
						decode_buffer.counts[buffer_index],
						decode_buffer.index,
						decode_buffer.size() == buffer_index + 1,
					);
					macro_op_index = 0
				}

				let mop = current_instr.info.macro_op(macro_op_index);
				// Dry run (commit = false) to find the earliest issue cycle.
				let schedule_cycle_mop = schedule_mop(false, mop, &mut port_busy, cycle, dep_cycle);
				if schedule_cycle_mop.is_none() {
					ports_saturated = true;
					break;
				}

				let mut schedule_cycle = schedule_cycle_mop.unwrap();
				// This macro-op consumes the source register: pick one, waiting
				// up to LOOK_FORWARD_CYCLES for a register to become ready.
				if macro_op_index as i32 == current_instr.info.src_op {
					let mut forward = 0;
					while forward < LOOK_FORWARD_CYCLES
						&& !current_instr.select_source(schedule_cycle, &registers, generator)
					{
						schedule_cycle += 1;
						cycle += 1;
						forward += 1;
					}

					// No source found in time: discard the instruction (bounded).
					if forward == LOOK_FORWARD_CYCLES {
						if throw_away_count < MAX_THROWAWAY_COUNT {
							throw_away_count += 1;
							// Marks the instruction as fully consumed so the next
							// iteration decodes a fresh one into this slot.
							macro_op_index = current_instr.info.size();
							continue;
						}
						current_instr = ScInstr::null();
						break;
					}
				}
				// Same look-ahead procedure for the destination register.
				if macro_op_index as i32 == current_instr.info.dst_op {
					let mut forward = 0;
					while forward < LOOK_FORWARD_CYCLES
						&& !current_instr.select_destination(
							schedule_cycle,
							throw_away_count > 0,
							&registers,
							generator,
						) {
						schedule_cycle += 1;
						cycle += 1;
						forward += 1;
					}
					if forward == LOOK_FORWARD_CYCLES {
						if throw_away_count < MAX_THROWAWAY_COUNT {
							throw_away_count += 1;
							macro_op_index = current_instr.info.size();
							continue;
						}
						current_instr = ScInstr::null();
						break;
					}
				}
				throw_away_count = 0;

				// Re-run the scheduler, this time committing port reservations.
				let schedule_cycle_mop =
					schedule_mop(true, mop, &mut port_busy, schedule_cycle, schedule_cycle);
				if schedule_cycle_mop.is_none() {
					ports_saturated = true;
					break;
				}
				schedule_cycle = schedule_cycle_mop.unwrap();
				// The macro-op's result becomes available after its latency.
				dep_cycle = schedule_cycle + mop.latency;

				// The result-producing macro-op updates the destination register's
				// scheduling state and the overall retire cycle.
				if macro_op_index == current_instr.info.result_op {
					let ri = &mut registers[current_instr.dst as usize];
					retire_cycle = dep_cycle;
					ri.latency = retire_cycle;
					ri.last_op_group = current_instr.op_group;
					ri.last_op_par = current_instr.op_group_par;
				}
				code_size += mop.size;
				buffer_index += 1;
				macro_op_index += 1;
				macro_op_count += 1;

				if schedule_cycle >= RANDOMX_SUPERSCALAR_LATENCY {
					ports_saturated = true;
				}
				// Restore the decode cycle; look-ahead bumps above were temporary.
				cycle = top_cycle;

				// Instruction fully issued: account for it in the program.
				if macro_op_index >= current_instr.info.size() {
					if current_instr.info.op.is_multiplication() {
						mul_count += 1;
					}
					prog.push(current_instr);
					program_size += 1;
				}
			}
			cycle += 1;
			decode_cycle += 1;
		}

		// Macro-ops retired per cycle. NOTE(review): yields NaN/inf if
		// retire_cycle is still 0 — presumably unreachable in practice; confirm.
		let ipc = macro_op_count as f64 / retire_cycle as f64;
		// Per-register dependency-chain depth for an idealised ASIC: each
		// instruction extends its destination's chain by 1 over the deeper
		// of its two operand chains.
		let mut asic_latencies = vec![0; 8];
		for &instr in prog.iter().take(program_size) {
			let lat_dst = asic_latencies[instr.dst as usize] + 1;
			let lat_src = if instr.src < 0 || instr.src == instr.dst {
				0
			} else {
				asic_latencies[instr.src as usize] + 1
			};
			asic_latencies[instr.dst as usize] = lat_dst.max(lat_src);
		}

		// The register with the longest ASIC chain becomes the address register.
		let mut asic_latency_max = 0;
		let mut address_reg = 0;
		let mut cpu_latencies = vec![0; 8];
		for i in 0..8 {
			if asic_latencies[i] > asic_latency_max {
				asic_latency_max = asic_latencies[i];
				address_reg = i;
			}
			cpu_latencies[i] = registers[i].latency;
		}

		ScProgram {
			prog,
			asic_latencies,
			cpu_latencies,
			address_reg,
			ipc,
			mul_count,
			cpu_latency: retire_cycle,
			asic_latency: asic_latency_max,
			code_size,
			macro_ops: macro_op_count,
			decode_cycles: decode_cycle,
		}
	}

	/// Interprets the program against the 8-register file `ds` in place.
	pub fn execute(&self, ds: &mut [u64; 8]) {
		for instr in &self.prog {
			let dst = instr.dst as usize;
			let src = instr.src as usize;
			match instr.info.op {
				ScOpcode::ISUB_R => ds[dst] = ds[dst].wrapping_sub(ds[src]),
				ScOpcode::IXOR_R => ds[dst] ^= ds[src],
				// Shifted add: shift amount comes from the mod byte.
				ScOpcode::IADD_RS => ds[dst] = ds[dst].wrapping_add(ds[src] << instr.mod_shift()),
				ScOpcode::IMUL_R => {
					ds[dst] = ds[dst].wrapping_mul(ds[src]);
				}
				ScOpcode::IROR_C => ds[dst] = ds[dst].rotate_right(instr.imm32),
				// u64_from_u32_imm widens the 32-bit immediate to 64 bits
				// (widening semantics defined in common.rs).
				ScOpcode::IADD_C7 | ScOpcode::IADD_C8 | ScOpcode::IADD_C9 => {
					ds[dst] = ds[dst].wrapping_add(u64_from_u32_imm(instr.imm32));
				}
				ScOpcode::IXOR_C7 | ScOpcode::IXOR_C8 | ScOpcode::IXOR_C9 => {
					ds[dst] ^= u64_from_u32_imm(instr.imm32);
				}
				// Upper 64 bits of the unsigned / signed 128-bit product.
				ScOpcode::IMULH_R => ds[dst] = mulh(ds[dst], ds[src]),
				ScOpcode::ISMULH_R => ds[dst] = smulh(ds[dst], ds[src]),
				ScOpcode::IMUL_RCP => {
					ds[dst] = ds[dst].wrapping_mul(randomx_reciprocal(instr.imm32 as u64))
				}
				ScOpcode::COUNT => panic!("COUNT execution tried"),
				// NOTE(review): "INVALLID" typo kept — it's a runtime string.
				ScOpcode::INVALID => panic!("INVALLID execution tried"),
			}
		}
	}
}
867
868#[allow(clippy::unnecessary_unwrap)]
869fn schedule_mop(
870	commit: bool,
871	mop: &ScMacroOp,
872	port_busy: &mut [[ExecutionPort; 3]; CYCLE_MAP_SIZE],
873	cycle_in: usize,
874	dep_cycle: usize,
875) -> Option<usize> {
876	let mut cycle = if mop.dependent {
877		usize::max(cycle_in, dep_cycle)
878	} else {
879		cycle_in
880	};
881
882	if mop.is_eliminated() {
883		return Some(cycle);
884	} else if mop.is_simple() {
885		return schedule_uop(commit, mop.uop1, port_busy, cycle);
886	} else {
887		while cycle < CYCLE_MAP_SIZE {
888			let cycle_1 = schedule_uop(false, mop.uop1, port_busy, cycle);
889			let cycle_2 = schedule_uop(false, mop.uop2, port_busy, cycle);
890
891			if cycle_1.is_some() && cycle_1 == cycle_2 {
892				if commit {
893					schedule_uop(true, mop.uop1, port_busy, cycle_1.unwrap());
894					schedule_uop(true, mop.uop2, port_busy, cycle_2.unwrap());
895				}
896				return cycle_1;
897			}
898			cycle += 1
899		}
900	}
901	None
902}
903
904fn schedule_uop(
905	commit: bool,
906	uop: ExecutionPort,
907	port_busy: &mut [[ExecutionPort; 3]; CYCLE_MAP_SIZE],
908	cycle_in: usize,
909) -> Option<usize> {
910	let mut cycle = cycle_in;
911	while cycle < CYCLE_MAP_SIZE {
912		if uop.is(ExecutionPort::P5) && port_busy[cycle][2] == ExecutionPort::NULL {
913			if commit {
914				port_busy[cycle][2] = uop;
915			}
916			return Some(cycle);
917		}
918		if uop.is(ExecutionPort::P0) && port_busy[cycle][0] == ExecutionPort::NULL {
919			if commit {
920				port_busy[cycle][0] = uop;
921			}
922			return Some(cycle);
923		}
924		if uop.is(ExecutionPort::P1) && port_busy[cycle][1] == ExecutionPort::NULL {
925			if commit {
926				port_busy[cycle][1] = uop;
927			}
928			return Some(cycle);
929		}
930		cycle += 1
931	}
932	None
933}