randomx4r-sys 0.1.1

FFI bindings for RandomX
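Since this is a raw -sys layer, all calls are unsafe FFI. The sketch below shows a minimal light-mode hash computation, assuming the crate re-exports the RandomX C API under its original names (randomx_alloc_cache, randomx_init_cache, randomx_create_vm, randomx_calculate_hash, ...), as bindgen-based -sys crates typically do; the exact generated signatures and the availability of randomx_get_flags in the bundled RandomX version are assumptions, not verified against 0.1.1.

use std::ffi::c_void;
use std::ptr;

use randomx4r_sys::*;

fn main() {
    let key = b"example key";
    let input = b"example input";
    let mut hash = [0u8; 32]; // RANDOMX_HASH_SIZE

    unsafe {
        // randomx_get_flags() returns the recommended flags for the current CPU
        // (present in RandomX >= 1.1.6; assumed available in the bundled copy).
        let flags = randomx_get_flags();

        // Light mode: cache only, no dataset (slower hashing, ~256 MiB instead of ~2 GiB).
        let cache = randomx_alloc_cache(flags);
        assert!(!cache.is_null(), "cache allocation failed");
        randomx_init_cache(cache, key.as_ptr() as *const c_void, key.len());

        let vm = randomx_create_vm(flags, cache, ptr::null_mut());
        assert!(!vm.is_null(), "VM creation failed");

        randomx_calculate_hash(
            vm,
            input.as_ptr() as *const c_void,
            input.len(),
            hash.as_mut_ptr() as *mut c_void,
        );

        randomx_destroy_vm(vm);
        randomx_release_cache(cache);
    }

    println!("{}", hash.iter().map(|b| format!("{:02x}", b)).collect::<String>());
}

Fast (full) mode would additionally allocate and fill a dataset (randomx_alloc_dataset, randomx_init_dataset) and pass it to randomx_create_vm instead of a null pointer.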
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
	* Redistributions of source code must retain the above copyright
	  notice, this list of conditions and the following disclaimer.
	* Redistributions in binary form must reproduce the above copyright
	  notice, this list of conditions and the following disclaimer in the
	  documentation and/or other materials provided with the distribution.
	* Neither the name of the copyright holder nor the
	  names of its contributors may be used to endorse or promote products
	  derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "configuration.h"
#include "program.hpp"
#include "blake2/endian.h"
#include <iostream>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <iomanip>
#include "superscalar.hpp"
#include "intrin_portable.h"
#include "reciprocal.h"
#include "common.hpp"

namespace randomx {

	static bool isMultiplication(SuperscalarInstructionType type) {
		return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP;
	}

	//uOPs (micro-ops) are represented only by the execution port they can go to
	namespace ExecutionPort {
		using type = int;
		constexpr type Null = 0;
		constexpr type P0 = 1;
		constexpr type P1 = 2;
		constexpr type P5 = 4;
		constexpr type P01 = P0 | P1;
		constexpr type P05 = P0 | P5;
		constexpr type P015 = P0 | P1 | P5;
	}

	//Macro-operation as output of the x86 decoder
	//Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op
	//Macro-op can consist of 1 or 2 uOPs.
	class MacroOp {
	public:
		MacroOp(const char* name, int size)
			: name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {}
		MacroOp(const char* name, int size, int latency, ExecutionPort::type uop)
			: name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {}
		MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2)
			: name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {}
		MacroOp(const MacroOp& parent, bool dependent)
			: name_(parent.name_), size_(parent.size_), latency_(parent.latency_), uop1_(parent.uop1_), uop2_(parent.uop2_), dependent_(dependent) {}
		const char* getName() const {
			return name_;
		}
		int getSize() const {
			return size_;
		}
		int getLatency() const {
			return latency_;
		}
		ExecutionPort::type getUop1() const {
			return uop1_;
		}
		ExecutionPort::type getUop2() const {
			return uop2_;
		}
		bool isSimple() const {
			return uop2_ == ExecutionPort::Null;
		}
		bool isEliminated() const {
			return uop1_ == ExecutionPort::Null;
		}
		bool isDependent() const {
			return dependent_;
		}
		static const MacroOp Add_rr;
		static const MacroOp Add_ri;
		static const MacroOp Lea_sib;
		static const MacroOp Sub_rr;
		static const MacroOp Imul_rr;
		static const MacroOp Imul_r;
		static const MacroOp Mul_r;
		static const MacroOp Mov_rr;
		static const MacroOp Mov_ri64;
		static const MacroOp Xor_rr;
		static const MacroOp Xor_ri;
		static const MacroOp Ror_rcl;
		static const MacroOp Ror_ri;
		static const MacroOp TestJz_fused;
		static const MacroOp Xor_self;
		static const MacroOp Cmp_ri;
		static const MacroOp Setcc_r;
	private:
		const char* name_;
		int size_;
		int latency_;
		ExecutionPort::type uop1_;
		ExecutionPort::type uop2_;
		bool dependent_ = false;
	};

	//Size: 3 bytes
	const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
	const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5);
	const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3);

	//Size: 4 bytes
	const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01);
	const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1);
	const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05);

	//Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes)
	const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015);

	//Size: 10 bytes
	const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015);

	//Unused:
	const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5);
	const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3);
	const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015);
	const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05);
	const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5);

	const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr };
	const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr };
	const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) };

	class SuperscalarInstructionInfo {
	public:
		const char* getName() const {
			return name_;
		}
		int getSize() const {
			return ops_.size();
		}
		bool isSimple() const {
			return getSize() == 1;
		}
		int getLatency() const {
			return latency_;
		}
		const MacroOp& getOp(int index) const {
			return ops_[index];
		}
		SuperscalarInstructionType getType() const {
			return type_;
		}
		int getResultOp() const {
			return resultOp_;
		}
		int getDstOp() const {
			return dstOp_;
		}
		int getSrcOp() const {
			return srcOp_;
		}
		static const SuperscalarInstructionInfo ISUB_R;
		static const SuperscalarInstructionInfo IXOR_R;
		static const SuperscalarInstructionInfo IADD_RS;
		static const SuperscalarInstructionInfo IMUL_R;
		static const SuperscalarInstructionInfo IROR_C;
		static const SuperscalarInstructionInfo IADD_C7;
		static const SuperscalarInstructionInfo IXOR_C7;
		static const SuperscalarInstructionInfo IADD_C8;
		static const SuperscalarInstructionInfo IXOR_C8;
		static const SuperscalarInstructionInfo IADD_C9;
		static const SuperscalarInstructionInfo IXOR_C9;
		static const SuperscalarInstructionInfo IMULH_R;
		static const SuperscalarInstructionInfo ISMULH_R;
		static const SuperscalarInstructionInfo IMUL_RCP;
		static const SuperscalarInstructionInfo NOP;
	private:
		const char* name_;
		SuperscalarInstructionType type_;
		std::vector<MacroOp> ops_;
		int latency_;
		int resultOp_ = 0;
		int dstOp_ = 0;
		int srcOp_;

		SuperscalarInstructionInfo(const char* name)
			: name_(name), type_(SuperscalarInstructionType::INVALID), latency_(0) {}
		SuperscalarInstructionInfo(const char* name, SuperscalarInstructionType type, const MacroOp& op, int srcOp)
			: name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) {
			ops_.push_back(MacroOp(op));
		}
		template <size_t N>
		SuperscalarInstructionInfo(const char* name, SuperscalarInstructionType type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp)
			: name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) {
			for (unsigned i = 0; i < N; ++i) {
				ops_.push_back(MacroOp(arr[i]));
				latency_ += ops_.back().getLatency();
			}
			static_assert(N > 1, "Invalid array size");
		}
	};

	const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1);

	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1);

	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1);
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1);
	
	const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP");

	//These are some of the options for how to split a 16-byte fetch window into 3 or 4 x86 instructions.
	//RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate).
	//Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction.
	const int buffer0[] = { 4, 8, 4 };
	const int buffer1[] = { 7, 3, 3, 3 };
	const int buffer2[] = { 3, 7, 3, 3 };
	const int buffer3[] = { 4, 9, 3 };
	const int buffer4[] = { 4, 4, 4, 4 };
	const int buffer5[] = { 3, 3, 10 };

	class DecoderBuffer {
	public:
		static const DecoderBuffer Default;
		template <size_t N>
		DecoderBuffer(const char* name, int index, const int(&arr)[N])
			: name_(name), index_(index), counts_(arr), opsCount_(N) {}
		const int* getCounts() const {
			return counts_;
		}
		int getSize() const {
			return opsCount_;
		}
		int getIndex() const {
			return index_;
		}
		const char* getName() const {
			return name_;
		}
		const DecoderBuffer* fetchNext(SuperscalarInstructionType instrType, int cycle, int mulCount, Blake2Generator& gen) const {
			//If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10
			//because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs.
			//Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops.
			if (instrType == SuperscalarInstructionType::IMULH_R || instrType == SuperscalarInstructionType::ISMULH_R)
				return &decodeBuffer3310;

			//To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications
			//is lower than the number of cycles.
			if (mulCount < cycle + 1)
				return &decodeBuffer4444;

			//If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication.
			if(instrType == SuperscalarInstructionType::IMUL_RCP)
				return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493;

			//Default: select a random fetch configuration.
			return fetchNextDefault(gen);
		}
	private:
		const char* name_;
		int index_;
		const int* counts_;
		int opsCount_;
		DecoderBuffer() : index_(-1) {}
		static const DecoderBuffer decodeBuffer484;
		static const DecoderBuffer decodeBuffer7333;
		static const DecoderBuffer decodeBuffer3733;
		static const DecoderBuffer decodeBuffer493;
		static const DecoderBuffer decodeBuffer4444;
		static const DecoderBuffer decodeBuffer3310;
		static const DecoderBuffer* decodeBuffers[4];
		const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const {
			return decodeBuffers[gen.getByte() & 3];
		}
	};

	const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0);
	const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1);
	const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2);
	const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer("4,9,3", 3, buffer3);
	const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4);
	const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 5, buffer5);

	const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = {
			&DecoderBuffer::decodeBuffer484,
			&DecoderBuffer::decodeBuffer7333,
			&DecoderBuffer::decodeBuffer3733,
			&DecoderBuffer::decodeBuffer493,
	};

	const DecoderBuffer DecoderBuffer::Default = DecoderBuffer();

	const SuperscalarInstructionInfo* slot_3[]  = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R };
	const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R };
	const SuperscalarInstructionInfo* slot_4[]  = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS };
	const SuperscalarInstructionInfo* slot_7[]  = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 };
	const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 };
	const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 };
	const SuperscalarInstructionInfo* slot_10   = &SuperscalarInstructionInfo::IMUL_RCP;

	static bool selectRegister(std::vector<int>& availableRegisters, Blake2Generator& gen, int& reg) {
		int index;
		if (availableRegisters.size() == 0)
			return false;

		if (availableRegisters.size() > 1) {
			index = gen.getUInt32() % availableRegisters.size();
		}
		else {
			index = 0;
		}
		reg = availableRegisters[index];
		return true;
	}

	class RegisterInfo {
	public:
		RegisterInfo() : latency(0), lastOpGroup(SuperscalarInstructionType::INVALID), lastOpPar(-1), value(0) {}
		int latency;
		SuperscalarInstructionType lastOpGroup;
		int lastOpPar;
		int value;
	};

	//"SuperscalarInstruction" consists of one or more macro-ops
	class SuperscalarInstruction {
	public:
		void toInstr(Instruction& instr) { //translate to a RandomX instruction format
			instr.opcode = (int)getType();
			instr.dst = dst_;
			instr.src = src_ >= 0 ? src_ : dst_;
			instr.setMod(mod_);
			instr.setImm32(imm32_);
		}

		void createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) {
			switch (slotSize)
			{
			case 3:
				//if this is the last slot, we can also select "IMULH" instructions
				if (isLast) {
					create(slot_3L[gen.getByte() & 3], gen);
				}
				else {
					create(slot_3[gen.getByte() & 1], gen);
				}
				break;
			case 4:
				//if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions
				if (fetchType == 4 && !isLast) {
					create(&SuperscalarInstructionInfo::IMUL_R, gen);
				}
				else {
					create(slot_4[gen.getByte() & 1], gen);
				}
				break;
			case 7:
				create(slot_7[gen.getByte() & 1], gen);
				break;
			case 8:
				create(slot_8[gen.getByte() & 1], gen);
				break;
			case 9:
				create(slot_9[gen.getByte() & 1], gen);
				break;
			case 10:
				create(slot_10, gen);
				break;
			default:
				UNREACHABLE;
			}
		}

		void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) {
			info_ = info;
			reset();
			switch (info->getType())
			{
			case SuperscalarInstructionType::ISUB_R: {
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IADD_RS;
				groupParIsSource_ = true;
			} break;

			case SuperscalarInstructionType::IXOR_R: {
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IXOR_R;
				groupParIsSource_ = true;
			} break;

			case SuperscalarInstructionType::IADD_RS: {
				mod_ = gen.getByte();
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IADD_RS;
				groupParIsSource_ = true;
			} break;

			case SuperscalarInstructionType::IMUL_R: {
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IMUL_R;
				groupParIsSource_ = true;
			} break;

			case SuperscalarInstructionType::IROR_C: {
				mod_ = 0;
				do {
					imm32_ = gen.getByte() & 63;
				} while (imm32_ == 0);
				opGroup_ = SuperscalarInstructionType::IROR_C;
				opGroupPar_ = -1;
			} break;

			case SuperscalarInstructionType::IADD_C7:
			case SuperscalarInstructionType::IADD_C8:
			case SuperscalarInstructionType::IADD_C9: {
				mod_ = 0;
				imm32_ = gen.getUInt32();
				opGroup_ = SuperscalarInstructionType::IADD_C7;
				opGroupPar_ = -1;
			} break;

			case SuperscalarInstructionType::IXOR_C7:
			case SuperscalarInstructionType::IXOR_C8:
			case SuperscalarInstructionType::IXOR_C9: {
				mod_ = 0;
				imm32_ = gen.getUInt32();
				opGroup_ = SuperscalarInstructionType::IXOR_C7;
				opGroupPar_ = -1;
			} break;

			case SuperscalarInstructionType::IMULH_R: {
				canReuse_ = true;
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::IMULH_R;
				opGroupPar_ = gen.getUInt32();
			} break;

			case SuperscalarInstructionType::ISMULH_R: {
				canReuse_ = true;
				mod_ = 0;
				imm32_ = 0;
				opGroup_ = SuperscalarInstructionType::ISMULH_R;
				opGroupPar_ = gen.getUInt32();
			} break;

			case SuperscalarInstructionType::IMUL_RCP: {
				mod_ = 0;
				do {
					imm32_ = gen.getUInt32();
				} while (isZeroOrPowerOf2(imm32_));
				opGroup_ = SuperscalarInstructionType::IMUL_RCP;
				opGroupPar_ = -1;
			} break;

			default:
				break;
			}
		}

		bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (&registers)[8], Blake2Generator& gen) {
			/*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R)
				std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/
			std::vector<int> availableRegisters;
			//Conditions for the destination register:
			// * value must be ready at the required cycle
			// * cannot be the same as the source register unless the instruction allows it
			//   - this avoids optimizable instructions such as "xor r, r" or "sub r, r"
			// * register cannot be multiplied twice in a row unless allowChainedMul is true 
			//   - this avoids accumulation of trailing zeroes in registers due to excessive multiplication
			//   - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator)
			// * either the last instruction applied to the register or its source must be different than this instruction
			//   - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2"
			// * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction)
			for (unsigned i = 0; i < 8; ++i) {
				if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != RegisterNeedsDisplacement))
					availableRegisters.push_back(i);
			}
			return selectRegister(availableRegisters, gen, dst_);
		}

		bool selectSource(int cycle, RegisterInfo(&registers)[8], Blake2Generator& gen) {
			std::vector<int> availableRegisters;
			//all registers that are ready at the cycle
			for (unsigned i = 0; i < 8; ++i) {
				if (registers[i].latency <= cycle)
					availableRegisters.push_back(i);
			}
			//if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination
			if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) {
				if (availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement) {
					opGroupPar_ = src_ = RegisterNeedsDisplacement;
					return true;
				}
			}
			if (selectRegister(availableRegisters, gen, src_)) {
				if (groupParIsSource_)
					opGroupPar_ = src_;
				return true;
			}
			return false;
		}

		SuperscalarInstructionType getType() {
			return info_->getType();
		}
		int getSource() {
			return src_;
		}
		int getDestination() {
			return dst_;
		}
		SuperscalarInstructionType getGroup() {
			return opGroup_;
		}
		int getGroupPar() {
			return opGroupPar_;
		}

		const SuperscalarInstructionInfo& getInfo() const {
			return *info_;
		}

		static const SuperscalarInstruction Null;

	private:
		const SuperscalarInstructionInfo* info_;
		int src_ = -1;
		int dst_ = -1;
		int mod_;
		uint32_t imm32_;
		SuperscalarInstructionType opGroup_;
		int opGroupPar_;
		bool canReuse_ = false;
		bool groupParIsSource_ = false;

		void reset() {
			src_ = dst_ = -1;
			canReuse_ = groupParIsSource_ = false;
		}

		SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) {
		}
	};

	const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP);

	constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4;
	constexpr int LOOK_FORWARD_CYCLES = 4;
	constexpr int MAX_THROWAWAY_COUNT = 256;

	template<bool commit>
	static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) {
		//The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload
		//port P1 (multiplication) by instructions that can go to any port.
		for (; cycle < CYCLE_MAP_SIZE; ++cycle) {
			if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) {
				if (commit) {
					if (trace) std::cout << "; P5 at cycle " << cycle << std::endl;
					portBusy[cycle][2] = uop;
				}
				return cycle;
			}
			if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) {
				if (commit) {
					if (trace) std::cout << "; P0 at cycle " << cycle << std::endl;
					portBusy[cycle][0] = uop;
				}
				return cycle;
			}
			if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) {
				if (commit) {
					if (trace) std::cout << "; P1 at cycle " << cycle << std::endl;
					portBusy[cycle][1] = uop;
				}
				return cycle;
			}
		}
		return -1;
	}

	template<bool commit>
	static int scheduleMop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) {
		//if this macro-op depends on the previous one, increase the starting cycle if needed
		//this handles an explicit dependency chain in IMUL_RCP
		if (mop.isDependent()) {
			cycle = std::max(cycle, depCycle);
		}
		//move instructions are eliminated and don't need an execution unit
		if (mop.isEliminated()) {
			if (commit)
				if (trace) std::cout << "; (eliminated)" << std::endl;
			return cycle;
		} 
		else if (mop.isSimple()) {
			//this macro-op has only one uOP
			return scheduleUop<commit>(mop.getUop1(), portBusy, cycle);
		}
		else {
			//macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle
			for (; cycle < CYCLE_MAP_SIZE; ++cycle) {

				int cycle1 = scheduleUop<false>(mop.getUop1(), portBusy, cycle);
				int cycle2 = scheduleUop<false>(mop.getUop2(), portBusy, cycle);

				if (cycle1 >= 0 && cycle1 == cycle2) {
					if (commit) {
						scheduleUop<true>(mop.getUop1(), portBusy, cycle1);
						scheduleUop<true>(mop.getUop2(), portBusy, cycle2);
					}
					return cycle1;
				}
			}
		}

		return -1;
	}

	void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) {

		ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3];
		memset(portBusy, 0, sizeof(portBusy));
		RegisterInfo registers[8];

		const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default;
		SuperscalarInstruction currentInstruction = SuperscalarInstruction::Null;
		int macroOpIndex = 0;
		int codeSize = 0;
		int macroOpCount = 0;
		int cycle = 0;
		int depCycle = 0;
		int retireCycle = 0;
		bool portsSaturated = false;
		int programSize = 0;
		int mulCount = 0;
		int decodeCycle;
		int throwAwayCount = 0;

		//decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated.
		//Each decode cycle decodes 16 bytes of x86 code.
		//Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always
		//saturated first. The cycle limit is present only to guarantee loop termination.
		//Program size is limited to SuperscalarMaxSize instructions.
		for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && programSize < SuperscalarMaxSize; ++decodeCycle) {

			//select a decode configuration
			decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen);
			if (trace) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl;

			int bufferIndex = 0;
			
			//fill all instruction slots in the current decode buffer
			while (bufferIndex < decodeBuffer->getSize()) {
				int topCycle = cycle;

				//if we have issued all macro-ops for the current RandomX instruction, create a new instruction
				if (macroOpIndex >= currentInstruction.getInfo().getSize()) {
					if (portsSaturated || programSize >= SuperscalarMaxSize)
						break;
					//select an instruction so that the first macro-op fits into the current slot
					currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0);
					macroOpIndex = 0;
					if (trace) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl;
				}
				const MacroOp& mop = currentInstruction.getInfo().getOp(macroOpIndex);
				if (trace) std::cout << mop.getName() << " ";

				//calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution
				int scheduleCycle = scheduleMop<false>(mop, portBusy, cycle, depCycle);
				if (scheduleCycle < 0) {
					if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl;
					//__debugbreak();
					portsSaturated = true;
					break;
				}

				//find a source register (if applicable) that will be ready when this instruction executes
				if (macroOpIndex == currentInstruction.getInfo().getSrcOp()) {
					int forward;
					//if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward
					for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) {
						if (trace) std::cout << "; src STALL at cycle " << cycle << std::endl;
						++scheduleCycle;
						++cycle;
					}
					//if no register was found, throw the instruction away and try another one
					if (forward == LOOK_FORWARD_CYCLES) {
						if (throwAwayCount < MAX_THROWAWAY_COUNT) {
							throwAwayCount++;
							macroOpIndex = currentInstruction.getInfo().getSize();
							if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl;
							//cycle = topCycle;
							continue;
						}
						//abort this decode buffer
						if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl;
						currentInstruction = SuperscalarInstruction::Null;
						break;
					}
					if (trace) std::cout << "; src = r" << currentInstruction.getSource() << std::endl;
				}
				//find a destination register that will be ready when this instruction executes
				if (macroOpIndex == currentInstruction.getInfo().getDstOp()) {
					int forward;
					for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) {
						if (trace) std::cout << "; dst STALL at cycle " << cycle << std::endl;
						++scheduleCycle;
						++cycle;
					}
					if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away
						if (throwAwayCount < MAX_THROWAWAY_COUNT) {
							throwAwayCount++;
							macroOpIndex = currentInstruction.getInfo().getSize();
							if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl;
							//cycle = topCycle;
							continue;
						}
						//abort this decode buffer
						if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl;
						currentInstruction = SuperscalarInstruction::Null;
						break;
					}
					if (trace) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl;
				}
				throwAwayCount = 0;

				//recalculate when the instruction can be scheduled for execution based on operand availability
				scheduleCycle = scheduleMop<true>(mop, portBusy, scheduleCycle, scheduleCycle);

				if (scheduleCycle < 0) {
					if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << scheduleCycle << ")" << std::endl;
					portsSaturated = true;
					break;
				}

				//calculate when the result will be ready
				depCycle = scheduleCycle + mop.getLatency();

				//if this instruction writes the result, modify register information
				//  RegisterInfo.latency - which cycle the register will be ready
				//  RegisterInfo.lastOpGroup - the last operation that was applied to the register
				//  RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register)
				if (macroOpIndex == currentInstruction.getInfo().getResultOp()) {
					int dst = currentInstruction.getDestination();
					RegisterInfo& ri = registers[dst];
					retireCycle = depCycle;
					ri.latency = retireCycle;
					ri.lastOpGroup = currentInstruction.getGroup();
					ri.lastOpPar = currentInstruction.getGroupPar();
					if (trace) std::cout << "; RETIRED at cycle " << retireCycle << std::endl;
				}
				codeSize += mop.getSize();
				bufferIndex++;
				macroOpIndex++;
				macroOpCount++;

				//terminating condition
				if (scheduleCycle >= RANDOMX_SUPERSCALAR_LATENCY) {
					portsSaturated = true;
				}
				cycle = topCycle;

				//when all macro-ops of the current instruction have been issued, add the instruction into the program
				if (macroOpIndex >= currentInstruction.getInfo().getSize()) {
					currentInstruction.toInstr(prog(programSize++));
					mulCount += isMultiplication(currentInstruction.getType());
				}
			}
			++cycle;
		}

		double ipc = (macroOpCount / (double)retireCycle);

		memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies));

		//Calculate ASIC latency:
		//Assumes 1 cycle latency for all operations and unlimited parallelization.
		for (int i = 0; i < programSize; ++i) {
			Instruction& instr = prog(i);
			int latDst = prog.asicLatencies[instr.dst] + 1;
			int latSrc = instr.dst != instr.src ? prog.asicLatencies[instr.src] + 1 : 0;
			prog.asicLatencies[instr.dst] = std::max(latDst, latSrc);
		}

		//address register is the register with the highest ASIC latency
		int asicLatencyMax = 0;
		int addressReg = 0;
		for (int i = 0; i < 8; ++i) {
			if (prog.asicLatencies[i] > asicLatencyMax) {
				asicLatencyMax = prog.asicLatencies[i];
				addressReg = i;
			}
			prog.cpuLatencies[i] = registers[i].latency;
		}

		prog.setSize(programSize);
		prog.setAddressRegister(addressReg);

		prog.cpuLatency = retireCycle;
		prog.asicLatency = asicLatencyMax;
		prog.codeSize = codeSize;
		prog.macroOps = macroOpCount;
		prog.decodeCycles = decodeCycle;
		prog.ipc = ipc;
		prog.mulCount = mulCount;
		

		/*if(INFO) std::cout << "; ALU port utilization:" << std::endl;
		if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl;

		int portCycles = 0;
		for (int i = 0; i < CYCLE_MAP_SIZE; ++i) {
			std::cout << "; " << std::setw(3) << i << " ";
			for (int j = 0; j < 3; ++j) {
				std::cout << (portBusy[i][j] ? '*' : '_');
				portCycles += !!portBusy[i][j];
			}
			std::cout << std::endl;
		}*/
	}

	void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t> *reciprocals) {
		for (unsigned j = 0; j < prog.getSize(); ++j) {
			Instruction& instr = prog(j);
			switch ((SuperscalarInstructionType)instr.opcode)
			{
			case SuperscalarInstructionType::ISUB_R:
				r[instr.dst] -= r[instr.src];
				break;
			case SuperscalarInstructionType::IXOR_R:
				r[instr.dst] ^= r[instr.src];
				break;
			case SuperscalarInstructionType::IADD_RS:
				r[instr.dst] += r[instr.src] << instr.getModShift();
				break;
			case SuperscalarInstructionType::IMUL_R:
				r[instr.dst] *= r[instr.src];
				break;
			case SuperscalarInstructionType::IROR_C:
				r[instr.dst] = rotr(r[instr.dst], instr.getImm32());
				break;
			case SuperscalarInstructionType::IADD_C7:
			case SuperscalarInstructionType::IADD_C8:
			case SuperscalarInstructionType::IADD_C9:
				r[instr.dst] += signExtend2sCompl(instr.getImm32());
				break;
			case SuperscalarInstructionType::IXOR_C7:
			case SuperscalarInstructionType::IXOR_C8:
			case SuperscalarInstructionType::IXOR_C9:
				r[instr.dst] ^= signExtend2sCompl(instr.getImm32());
				break;
			case SuperscalarInstructionType::IMULH_R:
				r[instr.dst] = mulh(r[instr.dst], r[instr.src]);
				break;
			case SuperscalarInstructionType::ISMULH_R:
				r[instr.dst] = smulh(r[instr.dst], r[instr.src]);
				break;
			case SuperscalarInstructionType::IMUL_RCP:
				if (reciprocals != nullptr)
					r[instr.dst] *= (*reciprocals)[instr.getImm32()];
				else
					r[instr.dst] *= randomx_reciprocal(instr.getImm32());
				break;
			default:
				UNREACHABLE;
			}
		}
	}
}