quantrs2-circuit 0.2.0

Quantum circuit representation and DSL for the QuantRS2 framework
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
//! Solovay-Kitaev Algorithm for single-qubit gate approximation.
//!
//! Implements the Solovay-Kitaev theorem (Dawson & Nielsen, quant-ph/0505030):
//! any single-qubit SU(2) gate can be approximated to precision ε using
//! O(log^c(1/ε)) gates from the universal set {H, T, T†, S, S†}.
//!
//! S = T² and S† = T†² are algebraically expressible in terms of T and T†, but
//! enumerating them as separate BFS atoms shortens many useful sequences (e.g.
//! Y-axis-rotation conjugates such as S·H·RZ(θ)·H·S†) at the cost of a denser
//! basic table.
//!
//! # Design
//!
//! Rather than storing SU(2) in the `[[a,-b*],[b,a*]]` parameterization (which
//! forces H into a non-standard embedding), we store the full 2×2 unitary matrix.
//! Gate multiplication, adjoint and distance are computed directly.  This avoids
//! the representation issues with H (det = −1) while keeping the struct name SU2.
//!
//! Frobenius distance is computed **up to global phase**:
//!   d(U,V) = sqrt(2 − |Tr(U†V)|)
//! which is the correct metric for SK's approximation quality.

use quantrs2_core::error::{QuantRS2Error, QuantRS2Result};
use scirs2_core::Complex64;
use std::collections::HashMap;
use std::f64::consts::{PI, SQRT_2};

/// 2×2 single-qubit unitary matrix stored in row-major order:
///   [[m00, m01], [m10, m11]]
///
/// No assumption is made about the specific SU(2) parameterization;
/// the matrix may represent any U(2) unitary (global phase is tracked
/// but ignored when measuring approximation quality).
#[derive(Debug, Clone, Copy)]
pub struct SU2 {
    pub m00: Complex64,
    pub m01: Complex64,
    pub m10: Complex64,
    pub m11: Complex64,
}

impl SU2 {
    /// Create a new SU2 matrix from four complex entries.
    pub fn new(m00: Complex64, m01: Complex64, m10: Complex64, m11: Complex64) -> Self {
        Self { m00, m01, m10, m11 }
    }

    /// Identity gate.
    pub fn identity() -> Self {
        Self {
            m00: Complex64::new(1.0, 0.0),
            m01: Complex64::new(0.0, 0.0),
            m10: Complex64::new(0.0, 0.0),
            m11: Complex64::new(1.0, 0.0),
        }
    }

    /// Construct an SU2 matrix from a named gate in {H, T, Tdg, S, Sdg, X, Y, Z}.
    ///
    /// Returns `None` for unrecognised gate names.
    pub fn from_gate_name(name: &str) -> Option<Self> {
        let inv_sqrt2 = 1.0 / SQRT_2;
        let t_phase = Complex64::new(inv_sqrt2, inv_sqrt2); // e^{iπ/4}
        let tdg_phase = Complex64::new(inv_sqrt2, -inv_sqrt2); // e^{-iπ/4}

        match name {
            "H" => Some(Self {
                m00: Complex64::new(inv_sqrt2, 0.0),
                m01: Complex64::new(inv_sqrt2, 0.0),
                m10: Complex64::new(inv_sqrt2, 0.0),
                m11: Complex64::new(-inv_sqrt2, 0.0),
            }),
            "T" => Some(Self {
                m00: Complex64::new(1.0, 0.0),
                m01: Complex64::new(0.0, 0.0),
                m10: Complex64::new(0.0, 0.0),
                m11: t_phase,
            }),
            "Tdg" => Some(Self {
                m00: Complex64::new(1.0, 0.0),
                m01: Complex64::new(0.0, 0.0),
                m10: Complex64::new(0.0, 0.0),
                m11: tdg_phase,
            }),
            "S" => Some(Self {
                m00: Complex64::new(1.0, 0.0),
                m01: Complex64::new(0.0, 0.0),
                m10: Complex64::new(0.0, 0.0),
                m11: Complex64::new(0.0, 1.0),
            }),
            "Sdg" => Some(Self {
                m00: Complex64::new(1.0, 0.0),
                m01: Complex64::new(0.0, 0.0),
                m10: Complex64::new(0.0, 0.0),
                m11: Complex64::new(0.0, -1.0),
            }),
            "X" => Some(Self {
                m00: Complex64::new(0.0, 0.0),
                m01: Complex64::new(1.0, 0.0),
                m10: Complex64::new(1.0, 0.0),
                m11: Complex64::new(0.0, 0.0),
            }),
            "Y" => Some(Self {
                m00: Complex64::new(0.0, 0.0),
                m01: Complex64::new(0.0, -1.0),
                m10: Complex64::new(0.0, 1.0),
                m11: Complex64::new(0.0, 0.0),
            }),
            "Z" => Some(Self {
                m00: Complex64::new(1.0, 0.0),
                m01: Complex64::new(0.0, 0.0),
                m10: Complex64::new(0.0, 0.0),
                m11: Complex64::new(-1.0, 0.0),
            }),
            _ => None,
        }
    }

    /// Matrix multiplication: self * other.
    pub fn mul(&self, other: &SU2) -> SU2 {
        SU2 {
            m00: self.m00 * other.m00 + self.m01 * other.m10,
            m01: self.m00 * other.m01 + self.m01 * other.m11,
            m10: self.m10 * other.m00 + self.m11 * other.m10,
            m11: self.m10 * other.m01 + self.m11 * other.m11,
        }
    }

    /// Conjugate transpose (adjoint / Hermitian conjugate).
    pub fn adjoint(&self) -> SU2 {
        SU2 {
            m00: self.m00.conj(),
            m01: self.m10.conj(),
            m10: self.m01.conj(),
            m11: self.m11.conj(),
        }
    }

    /// Trace of the matrix.
    pub fn trace(&self) -> Complex64 {
        self.m00 + self.m11
    }

    /// Frobenius distance from identity, up to global phase:
    ///   d(U, I) = sqrt(2 − |Tr(U)|)
    ///
    /// This equals zero only when U is a global phase times identity.
    /// Used for table lookups (comparing approximations to targets).
    pub fn distance_from_identity(&self) -> f64 {
        let tr_abs = self.trace().norm();
        let val = (2.0 - tr_abs).max(0.0);
        val.sqrt()
    }

    /// Frobenius distance between two matrices, up to global phase:
    ///   d(U, V) = sqrt(2 − |Tr(U†V)|)
    ///
    /// Used for evaluating approximation quality (the output metric).
    pub fn frobenius_distance(&self, other: &SU2) -> f64 {
        let udagger_v = self.adjoint().mul(other);
        udagger_v.distance_from_identity()
    }

    /// Frobenius distance from identity WITHOUT global phase correction:
    ///   d(U, I) = sqrt(2 − Re[Tr(U)])
    ///
    /// This is the metric used internally by the Solovay-Kitaev recursion.
    /// It reflects the actual matrix distance (not up to phase), so that
    /// the residual delta = U * u_prev† has the correct geometric meaning.
    pub fn distance_from_identity_exact(&self) -> f64 {
        let re_tr = self.trace().re;
        let val = (2.0 - re_tr).max(0.0);
        val.sqrt()
    }

    /// Frobenius distance between two matrices WITHOUT global phase correction:
    ///   d(U, V) = sqrt(2 − Re[Tr(U†V)])
    pub fn frobenius_distance_exact(&self, other: &SU2) -> f64 {
        let udagger_v = self.adjoint().mul(other);
        udagger_v.distance_from_identity_exact()
    }

    /// Rotation angle θ (in radians) such that U = exp(iθ/2 n̂·σ) * global_phase.
    ///
    /// Computed as θ = 2 * arccos(|Tr(U)| / 2) using the absolute trace,
    /// which factors out the global phase and gives the physical rotation angle.
    ///
    /// For the SU(2) representative R (with det = 1), Tr(R) = 2 cos(θ/2) is real.
    /// For a general U(2) matrix U = e^{iφ} R, |Tr(U)| = |Tr(R)| = 2|cos(θ/2)|.
    pub fn rotation_angle(&self) -> f64 {
        // Factor out global phase: |Tr(U)| = |2 cos(θ/2)| gives the physical angle.
        let tr_abs = self.trace().norm();
        let cos_half = (tr_abs / 2.0).clamp(0.0, 1.0);
        2.0 * cos_half.acos()
    }

    /// Normalized rotation axis [nx, ny, nz] for the SU(2) rotation.
    ///
    /// Returns [0, 0, 1] (z-axis) if the rotation angle is near zero.
    pub fn rotation_axis(&self) -> [f64; 3] {
        let theta = self.rotation_angle();
        let s = (theta / 2.0).sin();

        if s.abs() < 1e-12 {
            return [0.0, 0.0, 1.0];
        }

        // For U = exp(iθ/2 n̂·σ) in SU(2), U = cos(θ/2)I + i sin(θ/2) n̂·σ
        // n̂·σ = [[nz, nx-iny],[nx+iny, -nz]]
        // Extract components from the off-diagonal imaginary / diagonal imaginary
        // Handle global phase: normalize by global phase factor.
        let det = self.m00 * self.m11 - self.m01 * self.m10;
        let global_phase_half = det.arg() / 2.0;
        let phase_inv = Complex64::new((-global_phase_half).cos(), (-global_phase_half).sin());

        let a00 = self.m00 * phase_inv;
        let a01 = self.m01 * phase_inv;
        let a10 = self.m10 * phase_inv;
        let a11 = self.m11 * phase_inv;

        // From SU(2) standard form: a00 = cos(θ/2) + i*nz*sin(θ/2)
        //                            a01 = i*(nx - i*ny)*... no, let's re-derive
        // Standard SU(2): U = [[c + i*nz*s, i*nx*s + ny*s],
        //                       [i*nx*s - ny*s, c - i*nz*s]]
        // where c = cos(θ/2), s = sin(θ/2)
        let nz = a00.im / s;
        let nx = a10.im / s;
        let ny = a10.re / s;

        let _ = (a01, a11); // suppress unused warnings

        let norm = (nx * nx + ny * ny + nz * nz).sqrt();
        if norm < 1e-12 {
            [0.0, 0.0, 1.0]
        } else {
            [nx / norm, ny / norm, nz / norm]
        }
    }

    /// Construct a rotation about the X-axis: exp(+i*angle/2 * X).
    /// Matrix: [[cos(angle/2), i*sin(angle/2)], [i*sin(angle/2), cos(angle/2)]]
    pub fn rotation_x(angle: f64) -> SU2 {
        let c = (angle / 2.0).cos();
        let s = (angle / 2.0).sin();
        SU2 {
            m00: Complex64::new(c, 0.0),
            m01: Complex64::new(0.0, s),
            m10: Complex64::new(0.0, s),
            m11: Complex64::new(c, 0.0),
        }
    }

    /// Construct a rotation about the Y-axis: exp(+i*angle/2 * Y).
    /// Matrix: [[cos(angle/2), sin(angle/2)], [-sin(angle/2), cos(angle/2)]]
    pub fn rotation_y(angle: f64) -> SU2 {
        let c = (angle / 2.0).cos();
        let s = (angle / 2.0).sin();
        SU2 {
            m00: Complex64::new(c, 0.0),
            m01: Complex64::new(s, 0.0),
            m10: Complex64::new(-s, 0.0),
            m11: Complex64::new(c, 0.0),
        }
    }

    /// Construct a rotation about the Z-axis: exp(+i*angle/2 * Z).
    /// Matrix: [[e^{i*angle/2}, 0], [0, e^{-i*angle/2}]]
    pub fn rotation_z(angle: f64) -> SU2 {
        let half = angle / 2.0;
        SU2 {
            m00: Complex64::new(half.cos(), half.sin()),
            m01: Complex64::new(0.0, 0.0),
            m10: Complex64::new(0.0, 0.0),
            m11: Complex64::new(half.cos(), -half.sin()),
        }
    }
}

/// Compute SU(2) rotation about arbitrary unit axis [nx, ny, nz] by angle theta.
///
/// U = cos(θ/2) I + i sin(θ/2) (nx X + ny Y + nz Z)
///
/// Matrix form (standard SU(2)):
///   [[cos(θ/2) + i*nz*sin(θ/2),  (ny + i*nx)*sin(θ/2)],
///    [(-ny + i*nx)*sin(θ/2),       cos(θ/2) - i*nz*sin(θ/2)]]
pub fn rotation_about_axis(nx: f64, ny: f64, nz: f64, theta: f64) -> SU2 {
    let c = (theta / 2.0).cos();
    let s = (theta / 2.0).sin();
    SU2 {
        m00: Complex64::new(c, nz * s),
        m01: Complex64::new(ny * s, nx * s),
        m10: Complex64::new(-ny * s, nx * s),
        m11: Complex64::new(c, -nz * s),
    }
}

/// Decompose U into a balanced commutator V * W * V† * W† ≈ U.
///
/// This works when U is close to identity (small rotation angle).
/// Based on the Dawson-Nielsen algorithm Section IV.A.
///
/// Given U = exp(iθ n̂·σ/2), we find φ such that sin²(2φ) = sin(θ/2),
/// i.e. 2φ = arcsin(√sin(θ/2)), from the leading-order BCH relation
///   VWV†W† ≈ exp(i·4φ²·n̂·σ)  with 4φ² ≈ θ for small θ.
///
/// Then V = rotation about e₂ by 2φ, W = rotation about e₁ by 2φ,
/// where {e₁, e₂, n̂} form an orthonormal triad with e₂ = n̂ × e₁.
///
/// The group commutator VWV†W† has rotation axis +n̂ (not −n̂), because
/// the Lie-algebra commutator [e₂·σ, e₁·σ] = 2i(e₂×e₁)·σ = 2i(−n̂)·σ
/// and the group-commutator first-order term picks up an additional minus,
/// giving net axis +n̂ matching delta.
///
/// # Convergence note
///
/// The perpendicular-axis BCH commutator matches the rotation **angle** of
/// delta to leading order in θ but has O(φ) residual **axis** drift at finite
/// rotation angles.  In practice, the fallback guard in `sk_recurse` means
/// recursion depth > 1 rarely gives strict improvement over depth 1: the
/// commutator construction removes one layer of error but the axis drift
/// floors the residual near the depth-1 result.  Depth 1 provides the
/// practical O(ε^{3/2}) improvement; deeper depths are bounded above
/// (never worse due to the fallback) but not guaranteed to be strictly better.
///
/// Returns (V, W) such that V*W*V†*W† ≈ U.
pub fn balanced_commutator_decompose(u: &SU2) -> QuantRS2Result<(SU2, SU2)> {
    let theta = u.rotation_angle();

    // BCH formula: sin²(2φ) = sin(θ/2), i.e. 2φ = arcsin(√sin(θ/2))
    // For small θ: 4φ² ≈ θ, matching the rotation angle of the commutator.
    let sin_half_theta = (theta / 2.0).sin().abs();

    let sin_sq_phi_arg = sin_half_theta.sqrt();
    // Clamp due to floating point
    let phi = if sin_sq_phi_arg >= 1.0 {
        PI / 4.0
    } else {
        sin_sq_phi_arg.asin() / 2.0
    };

    let n_hat = u.rotation_axis();
    let (nx, ny, nz) = (n_hat[0], n_hat[1], n_hat[2]);

    // Find e1 perpendicular to n̂
    // If n̂ is not parallel to ẑ, use e1 = normalize(n̂ × ẑ)
    // Otherwise use e1 = x̂
    let (e1x, e1y, e1z) = if (nz.abs() - 1.0).abs() > 1e-6 {
        // n̂ × ẑ = (ny*1 - nz*0, nz*0 - nx*1, nx*0 - ny*0) = (ny, -nx, 0)
        let cx = ny;
        let cy = -nx;
        let cz = 0.0_f64;
        let norm = (cx * cx + cy * cy + cz * cz).sqrt();
        if norm < 1e-12 {
            (1.0_f64, 0.0_f64, 0.0_f64)
        } else {
            (cx / norm, cy / norm, cz / norm)
        }
    } else {
        (1.0_f64, 0.0_f64, 0.0_f64)
    };

    // e2 = n̂ × e1
    let e2x = ny * e1z - nz * e1y;
    let e2y = nz * e1x - nx * e1z;
    let e2z = nx * e1y - ny * e1x;

    // V = rotation about e2 by 2*phi, W = rotation about e1 by 2*phi.
    // The group commutator V·W·V†·W† ≈ exp(-2iφ²(e2×e1)·σ).
    // For our setup: e2 = n̂×e1, so e2×e1 = (n̂×e1)×e1 = -n̂ (by BAC-CAB rule).
    // Thus commutator ≈ exp(-2iφ²(-n̂)·σ) = exp(2iφ²n̂·σ) = rotation about n̂ by 4φ².
    // With sin²(2φ) = sin(θ/2), for small θ: 4φ² ≈ θ, giving the approximation.
    let v = rotation_about_axis(e2x, e2y, e2z, 2.0 * phi);
    // W = rotation about e1 by 2*phi
    let w = rotation_about_axis(e1x, e1y, e1z, 2.0 * phi);

    Ok((v, w))
}

/// Look up table of gate sequences for the universal gate set {H, T, Tdg, S, Sdg}.
///
/// **Two-stage Clifford+T construction.**  The single-qubit Clifford group
/// modulo global phase has only 24 elements, so a naïve BFS over the 5-atom
/// alphabet quickly collapses every branch onto that 24-element coset and
/// stalls.  To get richer SU(2) coverage we decouple the order-24 Clifford
/// subgroup from the T-count growth:
///
/// 1. **Stage 1 — Clifford enumeration.**  BFS over {H, S, Sdg} until
///    saturation produces exactly 24 distinct unitaries (up to global phase).
///    Each carries a sequence of length ≤ 6 — the Clifford diameter for this
///    generator set.
///
/// 2. **Stage 2 — T-count strata.**  For each `k = 1 … max_t_count`, build
///    `S_k` from `S_{k-1}` by appending `T` or `Tdg`, then a Clifford from
///    Stage 1.  Every entry in `S_k` therefore has T-count exactly `k`,
///    sandwiched between Cliffords.  Global de-duplication across all strata
///    via the spatial-hash grid keeps the table compact.
///
/// The resulting table covers the {H,T} normal form up to T-count
/// `max_t_count`, with Cliffords absorbed at no extra T-count.  Each
/// additional stratum genuinely expands the SU(2) reach (the Clifford+T
/// group is dense) rather than re-deriving cosets the previous depth
/// already covered.
pub struct BasicApproximationTable {
    /// (gate_sequence, SU2_matrix) pairs. Pub for test introspection.
    pub entries: Vec<(Vec<&'static str>, SU2)>,
}

/// Convert an SU2 matrix to a canonical SU(2) representative for hashing.
///
/// A U(2) matrix M = e^{iφ} R where R ∈ SU(2). We factor out the global
/// phase by computing φ = arg(det(M)) / 2 and then R = e^{-iφ} M.
/// The canonical SU(2) element R has det(R) = 1 exactly.
///
/// To break the remaining ±I ambiguity (since R and -R have the same distance
/// from identity), we ensure that the component with the largest magnitude is
/// positive. This gives a unique canonical quaternion (q0, q1, q2, q3) for
/// each physical gate up to global phase.
///
/// Returns (q0, q1, q2, q3) where:
///   q0 = Re(R.m00), q1 = Im(R.m00), q2 = Re(R.m10), q3 = Im(R.m10)
fn su2_to_canonical_quat(m: &SU2) -> (f64, f64, f64, f64) {
    // Factor out global phase: det = m00*m11 - m01*m10, det = e^{2iφ}
    let det = m.m00 * m.m11 - m.m01 * m.m10;
    let phase_angle = det.arg() / 2.0; // φ = arg(det)/2
    let phase_inv = Complex64::new((-phase_angle).cos(), (-phase_angle).sin()); // e^{-iφ}
                                                                                // R = e^{-iφ} * M  → det(R) = e^{-2iφ} * e^{2iφ} = 1
    let r00 = m.m00 * phase_inv;
    let r10 = m.m10 * phase_inv;

    let (q0, q1, q2, q3) = (r00.re, r00.im, r10.re, r10.im);

    // SU(2) double cover: R and -R represent the same physical gate.
    // Canonicalize by ensuring the largest-magnitude component is positive.
    let max_abs = q0.abs().max(q1.abs()).max(q2.abs()).max(q3.abs());
    let sign = if (q0.abs() - max_abs).abs() < 1e-10 {
        if q0 >= 0.0 {
            1.0_f64
        } else {
            -1.0_f64
        }
    } else if (q1.abs() - max_abs).abs() < 1e-10 {
        if q1 >= 0.0 {
            1.0_f64
        } else {
            -1.0_f64
        }
    } else if (q2.abs() - max_abs).abs() < 1e-10 {
        if q2 >= 0.0 {
            1.0_f64
        } else {
            -1.0_f64
        }
    } else {
        if q3 >= 0.0 {
            1.0_f64
        } else {
            -1.0_f64
        }
    };

    (sign * q0, sign * q1, sign * q2, sign * q3)
}

/// Grid cell key for the spatial hash (deduplication grid).
fn quat_to_cell(q: (f64, f64, f64, f64), cell_size: f64) -> (i64, i64, i64, i64) {
    let cell = |x: f64| (x / cell_size).floor() as i64;
    (cell(q.0), cell(q.1), cell(q.2), cell(q.3))
}

/// Dedup tolerance: two SU(2) matrices are considered identical if their
/// up-to-global-phase Frobenius distance is below this threshold.
///
/// Direct construction (e.g. `SU2::rotation_z(π/2)` for S) versus accumulated
/// multiplication of, say, two T-gates can differ by ~1.5e-8 due to argument
/// reduction differences inside the trigonometric primitives. `1e-6` absorbs
/// that margin while remaining 4 orders of magnitude smaller than the typical
/// inter-sequence spacing on SU(2). A value below ~1e-7 causes the BFS to
/// bloat with near-duplicates (e.g. T·T failing to collapse onto S).
const DEDUP_TOL: f64 = 1e-6;

/// Grid cell size for the spatial hash.  Must be larger than `DEDUP_TOL` (in
/// quaternion coordinates) so near-duplicates land in adjacent cells and are
/// caught by the 16-cell neighbour sweep performed by `DedupTable::contains`.
const CELL_SIZE: f64 = 0.002;

/// Hash-grid–backed deduplication table for SU(2) elements.
///
/// Wraps a `Vec<(seq, matrix)>` plus a 4-D quaternion-grid hash so that
/// `is_new` runs in O(neighbours) rather than O(N).  Used by both Stage 1
/// (Clifford enumeration) and Stage 2 (T-count strata) so dedup is global.
struct DedupTable {
    entries: Vec<(Vec<&'static str>, SU2)>,
    grid: HashMap<(i64, i64, i64, i64), Vec<usize>>,
}

impl DedupTable {
    fn new() -> Self {
        Self {
            entries: Vec::new(),
            grid: HashMap::new(),
        }
    }

    /// Returns `true` if `mat` is already in the table (within `DEDUP_TOL`).
    fn contains(&self, mat: &SU2) -> bool {
        let q = su2_to_canonical_quat(mat);
        let (cx, cy, cz, cw) = quat_to_cell(q, CELL_SIZE);
        // Check the 3^4 = 81 neighbouring cells (adjacent in each of 4 dims).
        for dx in -1_i64..=1 {
            for dy in -1_i64..=1 {
                for dz in -1_i64..=1 {
                    for dw in -1_i64..=1 {
                        let cell = (cx + dx, cy + dy, cz + dz, cw + dw);
                        if let Some(indices) = self.grid.get(&cell) {
                            for &idx in indices {
                                let (_, ref existing_mat) = self.entries[idx];
                                if existing_mat.frobenius_distance(mat) < DEDUP_TOL {
                                    return true;
                                }
                            }
                        }
                    }
                }
            }
        }
        false
    }

    /// Insert (`seq`, `mat`) and register it in the spatial hash. Caller must
    /// have already established that `mat` is not a duplicate.
    fn insert(&mut self, seq: Vec<&'static str>, mat: SU2) {
        let idx = self.entries.len();
        let q = su2_to_canonical_quat(&mat);
        let cell = quat_to_cell(q, CELL_SIZE);
        self.grid.entry(cell).or_default().push(idx);
        self.entries.push((seq, mat));
    }
}

/// Enumerate the order-24 single-qubit Clifford group as `(matrix, sequence)`
/// pairs over the generator set {H, S, Sdg}.
///
/// The Clifford group modulo global phase is finite with order 24, generated
/// by H and S. BFS until saturation (queue empties) reaches every element in
/// at most 6 generator-applications.
///
/// Returns exactly 24 entries; the caller may assert this invariant.
fn enumerate_cliffords() -> Vec<(Vec<&'static str>, SU2)> {
    let h_gate = SU2::from_gate_name("H").unwrap_or_else(SU2::identity);
    let s_gate = SU2::from_gate_name("S").unwrap_or_else(SU2::identity);
    let sdg_gate = SU2::from_gate_name("Sdg").unwrap_or_else(SU2::identity);
    let generators: &[(&'static str, &SU2)] = &[("H", &h_gate), ("S", &s_gate), ("Sdg", &sdg_gate)];

    let mut table = DedupTable::new();
    table.insert(Vec::new(), SU2::identity());

    let mut queue: std::collections::VecDeque<usize> = std::collections::VecDeque::new();
    queue.push_back(0);

    while let Some(idx) = queue.pop_front() {
        let (seq, mat) = {
            let (s, m) = &table.entries[idx];
            (s.clone(), *m)
        };
        for (gen_name, gen_mat) in generators {
            let candidate_mat = mat.mul(gen_mat);
            if !table.contains(&candidate_mat) {
                let mut candidate_seq = Vec::with_capacity(seq.len() + 1);
                candidate_seq.extend_from_slice(&seq);
                candidate_seq.push(gen_name);
                let new_idx = table.entries.len();
                table.insert(candidate_seq, candidate_mat);
                queue.push_back(new_idx);
            }
        }
    }

    table.entries
}

impl BasicApproximationTable {
    /// Build the basic approximation table via the two-stage Clifford+T BFS.
    ///
    /// Stage 1 enumerates the 24-element single-qubit Clifford group from
    /// {H, S, Sdg}.  Stage 2 builds T-count strata `S_1, …, S_K` where each
    /// `S_k` extends `S_{k-1}` by `T` or `Tdg` followed by an arbitrary
    /// Clifford. Global dedup keeps the table compact while every stratum
    /// genuinely extends SU(2) coverage (the Clifford+T group is dense).
    ///
    /// `table_depth` is **re-interpreted as the maximum T-count** for the
    /// new algorithm (it was the BFS depth previously). Internally capped at
    /// `MAX_T_COUNT_LIMIT` to avoid combinatorial explosion at large values
    /// — a `table_depth` of 10 collapses to the same internal cap.
    pub fn build(table_depth: usize) -> Self {
        // Internal cap on the maximum T-count.  Stratum size grows roughly
        // 2× per level once the Clifford-T mod-phase quotient saturates.
        // We pick the smallest level that gives strict RY(0.3) coverage
        // (i.e. some entry beats the identity bound on small Y rotations).
        const MAX_T_COUNT_LIMIT: usize = 10;
        let max_t_count = std::cmp::min(table_depth, MAX_T_COUNT_LIMIT);

        let t_gate = SU2::from_gate_name("T").unwrap_or_else(SU2::identity);
        let tdg_gate = SU2::from_gate_name("Tdg").unwrap_or_else(SU2::identity);
        let t_atoms: &[(&'static str, &SU2)] = &[("T", &t_gate), ("Tdg", &tdg_gate)];

        // ---- Stage 1: enumerate the 24-element Clifford group ----
        let cliffords = enumerate_cliffords();
        debug_assert_eq!(
            cliffords.len(),
            24,
            "single-qubit Clifford group should have 24 elements modulo global phase"
        );

        // ---- Stage 2: T-count strata, all hashed into a global dedup table ----
        let mut table = DedupTable::new();
        for (seq, mat) in &cliffords {
            table.insert(seq.clone(), *mat);
        }
        // `prev_stratum` carries indices into `table.entries` for the previous
        // stratum so we extend only those (without re-extending earlier strata).
        let mut prev_stratum: Vec<usize> = (0..cliffords.len()).collect();

        for _k in 1..=max_t_count {
            let mut next_stratum: Vec<usize> = Vec::new();
            for &prev_idx in &prev_stratum {
                // Snapshot the previous-stratum entry so subsequent inserts
                // (which may grow `table.entries`) don't invalidate the borrow.
                let (prev_seq, prev_mat) = {
                    let (s, m) = &table.entries[prev_idx];
                    (s.clone(), *m)
                };
                for (t_name, t_mat) in t_atoms {
                    // Hoist e_mat * t_atom out of the inner Clifford loop.
                    let after_t = prev_mat.mul(t_mat);
                    for (c_seq, c_mat) in &cliffords {
                        let candidate_mat = after_t.mul(c_mat);
                        if !table.contains(&candidate_mat) {
                            // Build the sequence only once we know it's new
                            // — saves up to 48× allocation per kept entry.
                            let mut candidate_seq =
                                Vec::with_capacity(prev_seq.len() + 1 + c_seq.len());
                            candidate_seq.extend_from_slice(&prev_seq);
                            candidate_seq.push(t_name);
                            candidate_seq.extend_from_slice(c_seq);
                            let new_idx = table.entries.len();
                            table.insert(candidate_seq, candidate_mat);
                            next_stratum.push(new_idx);
                        }
                    }
                }
            }
            if next_stratum.is_empty() {
                break;
            }
            prev_stratum = next_stratum;
        }

        Self {
            entries: table.entries,
        }
    }

    /// Find the sequence in the table closest to `target` (up-to-global-phase distance).
    ///
    /// Returns the slice of gate names and the distance achieved.
    /// O(N) linear scan over the entire table.
    pub fn find_closest<'a>(&'a self, target: &SU2) -> (&'a [&'static str], f64) {
        let mut best_dist = f64::INFINITY;
        let mut best_seq: &[&'static str] = &[];

        for (seq, mat) in &self.entries {
            let dist = mat.frobenius_distance(target);
            if dist < best_dist {
                best_dist = dist;
                best_seq = seq.as_slice();
            }
        }

        (best_seq, best_dist)
    }

    /// Number of entries in the table.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
}

/// Convert a named adjoint gate.
fn adjoint_gate(gate: &'static str) -> &'static str {
    match gate {
        "T" => "Tdg",
        "Tdg" => "T",
        "S" => "Sdg",
        "Sdg" => "S",
        "H" => "H",
        "X" => "X",
        "Y" => "Y",
        "Z" => "Z",
        other => other,
    }
}

/// Compute the SU2 matrix product of a gate sequence.
///
/// Sequences are applied left-to-right: the first gate in the slice is applied first.
pub fn sequence_to_matrix(seq: &[&'static str]) -> SU2 {
    seq.iter().fold(SU2::identity(), |acc, g| {
        let gate = SU2::from_gate_name(g).unwrap_or(SU2::identity());
        acc.mul(&gate)
    })
}

/// Solovay-Kitaev decomposer.
///
/// Approximates any single-qubit SU(2) gate to precision ε using sequences
/// from {H, T, Tdg, S, Sdg}.
///
/// # Convergence behaviour
///
/// The internal `balanced_commutator_decompose` uses a BCH-derived formula
/// that matches the rotation *angle* of the residual delta to leading order in
/// the rotation angle θ, but has an O(φ) axis-drift at finite θ.  As a result:
///
/// - **Depth 0**: pure table lookup — precision is limited by table density.
/// - **Depth 1**: provides meaningful O(ε^{3/2}) improvement over depth 0 for
///   targets where the table is not already near-optimal.
/// - **Depth ≥ 2**: the fallback guard (which never returns a worse result)
///   means deeper recursion is bounded above by the depth-1 result but is
///   rarely strictly better.  The axis drift of the commutator construction
///   constitutes a floor that prevents further BCH-based reduction.
///
/// In practice, use `recursion_depth = 1` for the best trade-off between
/// sequence length and approximation quality.  Depths 2–4 are safe (they never
/// regress) but provide negligible additional improvement with this gate set.
///
/// # Example
///
/// ```rust
/// use quantrs2_circuit::solovay_kitaev::{SU2, SOKDecomposer, sequence_to_matrix};
///
/// let target = SU2::rotation_z(0.5);
/// let decomposer = SOKDecomposer::new(10, 1);
/// let seq = decomposer.decompose(&target).expect("decomposition failed");
/// let approx = sequence_to_matrix(&seq);
/// let dist = approx.frobenius_distance(&target);
/// assert!(dist < 0.15, "SK approximation distance: {dist}");
/// ```
pub struct SOKDecomposer {
    table: BasicApproximationTable,
    /// Number of SK recursion levels (0 = table lookup only; 1 = best practical depth).
    ///
    /// Depth 1 provides meaningful improvement over depth 0.  Depths ≥ 2 are
    /// bounded above by the depth-1 result and rarely strictly better due to the
    /// axis-drift floor in the BCH commutator construction (see struct doc).
    pub recursion_depth: usize,
}

impl SOKDecomposer {
    /// Build the decomposer with a precomputed basic approximation table.
    ///
    /// `table_depth`: maximum T-count for the basic table (re-interpreted from
    ///   the previous BFS-depth meaning).  See [`BasicApproximationTable::build`]
    ///   for the two-stage Clifford+T construction.  The table internally caps
    ///   the value to keep build time bounded; passing `10` is fine — it will
    ///   collapse to the internal cap.  Smaller values (0–3) give sparser
    ///   coverage and may not be dense enough for the BCH commutator
    ///   construction to improve the result at recursion depth 1.
    ///
    /// `recursion_depth`: number of SK recursion levels (0 = table only).
    ///   Use 1 for the best improvement/gate-count trade-off.  Depth 1 gives
    ///   O(ε^{3/2}) improvement at a cost of ~4× more gates.  Depths ≥ 2 are
    ///   safe (never worse due to the fallback guard) but rarely strictly better
    ///   with this commutator construction — see struct-level doc for details.
    pub fn new(table_depth: usize, recursion_depth: usize) -> Self {
        Self {
            table: BasicApproximationTable::build(table_depth),
            recursion_depth,
        }
    }

    /// Decompose `u` into a sequence of gate names from {H, T, Tdg, S, Sdg}.
    pub fn decompose(&self, u: &SU2) -> QuantRS2Result<Vec<&'static str>> {
        self.sk_recurse(u, self.recursion_depth)
    }

    /// Internal recursive Solovay-Kitaev routine.
    fn sk_recurse(&self, u: &SU2, n: usize) -> QuantRS2Result<Vec<&'static str>> {
        if n == 0 {
            let (seq, _dist) = self.table.find_closest(u);
            return Ok(seq.to_vec());
        }

        // Obtain U_{n-1}: best approximation using n-1 recursion levels
        let u_prev_seq = self.sk_recurse(u, n - 1)?;
        let u_prev = sequence_to_matrix(&u_prev_seq);
        // Use up-to-phase distance for quality comparison (the reported metric)
        let prev_dist = u_prev.frobenius_distance(u);

        // Compute delta = U * u_prev† in a phase-aligned way.
        //
        // Since u_prev is found using the up-to-phase metric, the matrix u_prev
        // may differ from U by a global phase. We align phases before computing
        // the residual so that delta has Re[Tr] maximized (closest to identity
        // in the Dawson-Nielsen metric), enabling accurate commutator construction.
        //
        // Optimal alignment: multiply u_prev by e^{iφ} where φ = -arg(Tr(U * u_prev†))/2
        // This makes Tr(delta) real and positive.
        let raw_delta = u.mul(&u_prev.adjoint());
        let tr_delta = raw_delta.trace();
        let phase_angle = -tr_delta.arg() / 2.0; // Negative half the trace argument
        let phase = Complex64::new(phase_angle.cos(), phase_angle.sin());
        // Apply phase to u_prev: u_prev_phased = e^{iφ} * u_prev
        let u_prev_phased = SU2 {
            m00: u_prev.m00 * phase,
            m01: u_prev.m01 * phase,
            m10: u_prev.m10 * phase,
            m11: u_prev.m11 * phase,
        };
        // delta_phased = U * u_prev_phased† = U * e^{-iφ} * u_prev†
        // This has Re[Tr(delta_phased)] maximized = |Tr(raw_delta)|
        let delta = u.mul(&u_prev_phased.adjoint());

        // Decompose delta as a balanced commutator: delta ≈ V * W * V† * W†
        let (v, w) = balanced_commutator_decompose(&delta)?;

        // Recursively approximate V and W to depth n-1
        let v_seq = self.sk_recurse(&v, n - 1)?;
        let w_seq = self.sk_recurse(&w, n - 1)?;

        // V† sequence: reverse and conjugate each gate
        let v_adj_seq: Vec<&'static str> = v_seq.iter().rev().map(|g| adjoint_gate(g)).collect();
        let w_adj_seq: Vec<&'static str> = w_seq.iter().rev().map(|g| adjoint_gate(g)).collect();

        // Final sequence: V_n W_n V_n† W_n† U_{n-1}
        let mut result = v_seq;
        result.extend_from_slice(&w_seq);
        result.extend(v_adj_seq);
        result.extend(w_adj_seq);
        result.extend_from_slice(&u_prev_seq);

        // Guard: if the new result is worse than the previous approximation,
        // fall back to u_prev_seq. The commutator decomposition may introduce
        // more error than it corrects when the basic table is not dense enough.
        let new_mat = sequence_to_matrix(&result);
        let new_dist = new_mat.frobenius_distance(u);
        if new_dist >= prev_dist {
            return Ok(u_prev_seq);
        }

        Ok(result)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance for floating-point comparisons in basic unitary tests.
    /// Using 1e-6 to accommodate accumulated floating-point rounding.
    const EPS: f64 = 1e-6;

    #[test]
    fn test_su2_identity_distance() {
        let i = SU2::identity();
        assert!(
            i.distance_from_identity() < EPS,
            "identity distance should be ~0, got {}",
            i.distance_from_identity()
        );
    }

    #[test]
    fn test_su2_multiplication_hh_is_identity() {
        let h = SU2::from_gate_name("H").expect("H gate must exist");
        let hh = h.mul(&h);
        assert!(
            hh.frobenius_distance(&SU2::identity()) < EPS,
            "H*H should equal Identity (up to global phase), got {}",
            hh.frobenius_distance(&SU2::identity())
        );
    }

    #[test]
    fn test_su2_tt_is_s() {
        let t = SU2::from_gate_name("T").expect("T gate must exist");
        let s = SU2::from_gate_name("S").expect("S gate must exist");
        let tt = t.mul(&t);
        assert!(
            tt.frobenius_distance(&s) < EPS,
            "T*T should equal S, got {}",
            tt.frobenius_distance(&s)
        );
    }

    #[test]
    fn test_adjoint_t_tdg() {
        let t = SU2::from_gate_name("T").expect("T gate must exist");
        let tdg = SU2::from_gate_name("Tdg").expect("Tdg gate must exist");
        assert!(
            t.adjoint().frobenius_distance(&tdg) < EPS,
            "T† should equal Tdg, dist = {}",
            t.adjoint().frobenius_distance(&tdg)
        );
    }

    #[test]
    fn test_rotation_z_composition() {
        let rz1 = SU2::rotation_z(0.3);
        let rz2 = SU2::rotation_z(0.7);
        let combined = rz1.mul(&rz2);
        let expected = SU2::rotation_z(1.0);
        assert!(
            combined.frobenius_distance(&expected) < EPS,
            "RZ(0.3)*RZ(0.7) should equal RZ(1.0)"
        );
    }

    #[test]
    fn test_basic_approximation_table_size() {
        let table = BasicApproximationTable::build(5);
        assert!(
            table.entries.len() > 10,
            "Table should have many entries, got {}",
            table.entries.len()
        );
    }

    #[test]
    fn test_basic_approximation_table_finds_identity() {
        let table = BasicApproximationTable::build(5);
        let identity = SU2::identity();
        let (_seq, dist) = table.find_closest(&identity);
        assert!(
            dist < 0.1,
            "Should find approximation close to identity, dist = {}",
            dist
        );
    }

    #[test]
    fn test_balanced_commutator_small_rotation() {
        let small_u = SU2::rotation_z(0.1);
        let (v, w) =
            balanced_commutator_decompose(&small_u).expect("commutator decompose should succeed");
        let commutator = v.mul(&w).mul(&v.adjoint()).mul(&w.adjoint());
        let dist = commutator.frobenius_distance(&small_u);
        assert!(
            dist < 0.2,
            "Balanced commutator should approximate small rotation U, got dist={}",
            dist
        );
    }

    #[test]
    fn test_balanced_commutator_identity() {
        // Identity has zero rotation → commutator of identity matrices = identity
        let u = SU2::identity();
        let (v, w) =
            balanced_commutator_decompose(&u).expect("commutator decompose should succeed");
        let commutator = v.mul(&w).mul(&v.adjoint()).mul(&w.adjoint());
        let dist = commutator.frobenius_distance(&u);
        assert!(
            dist < 0.1,
            "Commutator decompose of identity should be identity, got dist={}",
            dist
        );
    }

    #[test]
    fn test_solovay_kitaev_rz_depth0() {
        // Depth=0 is pure table lookup — should find something within 0.5 of target.
        let target = SU2::rotation_z(0.5);
        let decomposer = SOKDecomposer::new(10, 0);
        let seq = decomposer.decompose(&target).expect("decompose failed");
        let approx = sequence_to_matrix(&seq);
        let dist = approx.frobenius_distance(&target);
        // Table lookup: with depth=10, expect dist < 0.2 for most targets
        assert!(
            dist < 0.5,
            "Depth-0 should be within 0.5 of target, got {dist}"
        );
    }

    #[test]
    fn test_solovay_kitaev_rz_depth1() {
        // Approximate RZ(0.5) using SK at depth=1 with the two-stage Clifford+T table
        // (table_depth=10 → max_t_count=10, ~73k entries).
        // SK at depth 1 gives O(ε^{3/2}) improvement over depth=0.
        // Empirically: depth-0 dist ≈ 0.044, depth-1 dist ≈ 0.037.
        let target = SU2::rotation_z(0.5);
        let decomposer = SOKDecomposer::new(10, 1);
        let seq = decomposer.decompose(&target).expect("decompose failed");

        let approx = sequence_to_matrix(&seq);
        let dist = approx.frobenius_distance(&target);
        let depth0_dist = {
            let d0 = SOKDecomposer::new(10, 0);
            let s0 = d0.decompose(&target).expect("depth0 failed");
            sequence_to_matrix(&s0).frobenius_distance(&target)
        };
        // SK depth=1 must strictly improve over depth=0
        assert!(
            dist < depth0_dist,
            "SK depth=1 ({dist:.6}) should improve over depth=0 ({depth0_dist:.6})"
        );
        // Absolute bound: with the dense Clifford+T table, depth-1 reaches ~0.037
        assert!(
            dist < 0.05,
            "SK depth=1 should achieve dist < 0.05, got {dist:.6}"
        );
        assert!(!seq.is_empty(), "Sequence should not be empty");
    }

    #[test]
    fn test_solovay_kitaev_rz_depth2() {
        // At depth 2, SK falls back to the depth-1 result due to the axis-drift
        // floor in the BCH commutator construction (see SOKDecomposer doc).
        // The fallback guard guarantees depth-2 is never worse than depth-1.
        let target = SU2::rotation_z(0.5);
        let decomposer2 = SOKDecomposer::new(10, 2);
        let seq2 = decomposer2.decompose(&target).expect("decompose failed");
        let dist2 = sequence_to_matrix(&seq2).frobenius_distance(&target);

        let decomposer1 = SOKDecomposer::new(10, 1);
        let seq1 = decomposer1.decompose(&target).expect("depth1 failed");
        let dist1 = sequence_to_matrix(&seq1).frobenius_distance(&target);

        // Depth=2 must be at least as good as depth=1 (fallback guard ensures this).
        // Strict improvement is not guaranteed with BCH commutator at this gate set density.
        assert!(
            dist2 <= dist1 + 1e-9,
            "SK depth=2 ({dist2:.6}) should not be worse than depth=1 ({dist1:.6})"
        );
    }

    #[test]
    fn test_solovay_kitaev_convergence_debug() {
        // Check convergence at each depth to understand algorithm behavior.
        // Uses table_depth=10 (~73k entries via the two-stage Clifford+T BFS) for
        // dense coverage.
        let target = SU2::rotation_z(0.5);
        let table = BasicApproximationTable::build(10);

        eprintln!("Table size: {}", table.entries.len());

        // Depth 0: table lookup — should find a good starting approximation
        let (seq0, dist0) = table.find_closest(&target);
        eprintln!("Depth 0: dist={:.6}, seq_len={}", dist0, seq0.len());

        // Print the top 5 closest entries in the table to RZ(0.5)
        let mut dists: Vec<(f64, usize)> = table
            .entries
            .iter()
            .enumerate()
            .map(|(i, (_, mat))| (mat.frobenius_distance(&target), i))
            .collect();
        dists.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
        eprintln!("Top-5 closest table entries to RZ(0.5):");
        for (dist, idx) in dists.iter().take(5) {
            eprintln!("  dist={:.6}, seq={:?}", dist, table.entries[*idx].0);
        }

        // Debug the commutator construction with phase-alignment (as in sk_recurse)
        let u_prev = sequence_to_matrix(seq0);
        let raw_delta = target.mul(&u_prev.adjoint());
        let tr_delta = raw_delta.trace();
        let phase_angle = -tr_delta.arg() / 2.0;
        let phase = Complex64::new(phase_angle.cos(), phase_angle.sin());
        let u_prev_phased = SU2 {
            m00: u_prev.m00 * phase,
            m01: u_prev.m01 * phase,
            m10: u_prev.m10 * phase,
            m11: u_prev.m11 * phase,
        };
        let delta = target.mul(&u_prev_phased.adjoint());
        let delta_dist = delta.distance_from_identity();
        eprintln!("Phase-aligned delta dist from identity: {:.6}", delta_dist);
        eprintln!(
            "Phase-aligned delta rotation angle: {:.6}",
            delta.rotation_angle()
        );

        let (v, w) = balanced_commutator_decompose(&delta).expect("commutator failed");
        let (v_seq, v_dist) = table.find_closest(&v);
        let (w_seq, w_dist) = table.find_closest(&w);
        eprintln!("V approx dist: {:.6}, W approx dist: {:.6}", v_dist, w_dist);

        // Compute the commutator using exact v and w
        let exact_comm = v.mul(&w).mul(&v.adjoint()).mul(&w.adjoint());
        let exact_comm_dist = exact_comm.frobenius_distance(&delta);
        eprintln!("Exact commutator dist from delta: {:.6}", exact_comm_dist);

        // Compute the commutator using approximated v and w
        let v_mat = sequence_to_matrix(v_seq);
        let w_mat = sequence_to_matrix(w_seq);
        let approx_comm = v_mat
            .mul(&w_mat)
            .mul(&v_mat.adjoint())
            .mul(&w_mat.adjoint());
        let approx_comm_to_delta = approx_comm.frobenius_distance(&delta);
        let approx_comm_to_i = approx_comm.distance_from_identity();
        eprintln!(
            "Approx commutator dist from delta: {:.6}",
            approx_comm_to_delta
        );
        eprintln!("Approx commutator dist from I: {:.6}", approx_comm_to_i);

        // Full depth=1 result: V_approx * W_approx * V_approx† * W_approx† * u_prev
        let v_adj_seq: Vec<&'static str> = v_seq.iter().rev().map(|g| adjoint_gate(g)).collect();
        let w_adj_seq: Vec<&'static str> = w_seq.iter().rev().map(|g| adjoint_gate(g)).collect();
        let mut full_seq = v_seq.to_vec();
        full_seq.extend_from_slice(w_seq);
        full_seq.extend(v_adj_seq);
        full_seq.extend(w_adj_seq);
        full_seq.extend_from_slice(seq0);
        let full_mat = sequence_to_matrix(&full_seq);
        let full_dist = full_mat.frobenius_distance(&target);
        eprintln!("Full depth=1 result dist from target: {:.6}", full_dist);

        // Using the full decomposer at depth=1
        let decomposer = SOKDecomposer::new(10, 1);
        let sk_seq = decomposer.decompose(&target).expect("SK decompose failed");
        let sk_mat = sequence_to_matrix(&sk_seq);
        let sk_dist = sk_mat.frobenius_distance(&target);
        eprintln!("SK depth=1 dist: {:.6}, seq_len={}", sk_dist, sk_seq.len());

        assert!(dist0 < 0.5, "depth-0 should find something reasonable");
    }

    #[test]
    fn test_solovay_kitaev_rz_depth3() {
        // At depth 3, SK is monotone (fallback guard ensures never-worse).
        // The BCH commutator axis-drift floors the result near the depth-1 value;
        // strict improvement over depth-1 is not guaranteed with this gate set.
        // With table_depth=10 (~73k entries), depth-1 achieves dist ≈ 0.037.
        let target = SU2::rotation_z(0.5);
        let d1 = SOKDecomposer::new(10, 1)
            .decompose(&target)
            .expect("depth1 failed");
        let dist1 = sequence_to_matrix(&d1).frobenius_distance(&target);

        let d3 = SOKDecomposer::new(10, 3)
            .decompose(&target)
            .expect("depth3 failed");
        let dist3 = sequence_to_matrix(&d3).frobenius_distance(&target);

        // Depth=3 must be at least as good as depth=1 (SK is monotone by construction).
        // Strict improvement is not guaranteed — depth-3 typically equals depth-1.
        assert!(
            dist3 <= dist1 + 1e-9,
            "SK depth=3 ({dist3:.6}) should not be worse than depth=1 ({dist1:.6})"
        );
        assert!(!d3.is_empty(), "Sequence should not be empty");
    }

    #[test]
    fn test_solovay_kitaev_sequence_gates_valid() {
        // All gates in sequence should be from {H, T, Tdg, S, Sdg}
        let target = SU2::rotation_z(0.5);
        let decomposer = SOKDecomposer::new(10, 2);
        let seq = decomposer.decompose(&target).expect("decompose failed");

        for gate in &seq {
            assert!(
                ["H", "T", "Tdg", "S", "Sdg"].contains(gate),
                "Gate '{}' not in universal set {{H, T, Tdg, S, Sdg}}",
                gate
            );
        }
    }

    #[test]
    fn test_sequence_to_matrix_empty() {
        let m = sequence_to_matrix(&[]);
        assert!(
            m.frobenius_distance(&SU2::identity()) < EPS,
            "Empty sequence should give identity"
        );
    }

    #[test]
    fn test_sequence_to_matrix_h() {
        let m = sequence_to_matrix(&["H"]);
        let h = SU2::from_gate_name("H").expect("H must exist");
        assert!(m.frobenius_distance(&h) < EPS);
    }

    #[test]
    fn test_adjoint_gate_roundtrip() {
        assert_eq!(adjoint_gate("T"), "Tdg");
        assert_eq!(adjoint_gate("Tdg"), "T");
        assert_eq!(adjoint_gate("H"), "H");
        assert_eq!(adjoint_gate("S"), "Sdg");
        assert_eq!(adjoint_gate("Sdg"), "S");
    }

    #[test]
    fn test_rotation_about_axis_z() {
        // rotation_about_axis(0, 0, 1, theta) should equal rotation_z(theta)
        let theta = 0.7;
        let r1 = rotation_about_axis(0.0, 0.0, 1.0, theta);
        let r2 = SU2::rotation_z(theta);
        assert!(
            r1.frobenius_distance(&r2) < EPS,
            "rotation_about_axis(z) should match rotation_z"
        );
    }

    #[test]
    fn test_rotation_about_axis_x() {
        let theta = 1.2;
        let r1 = rotation_about_axis(1.0, 0.0, 0.0, theta);
        let r2 = SU2::rotation_x(theta);
        assert!(
            r1.frobenius_distance(&r2) < EPS,
            "rotation_about_axis(x) should match rotation_x"
        );
    }

    #[test]
    fn test_clifford_enumeration_size() {
        // Sanity check: the single-qubit Clifford group modulo global phase
        // has exactly 24 elements, generated by {H, S, Sdg}.
        let cliffords = enumerate_cliffords();
        assert_eq!(
            cliffords.len(),
            24,
            "Clifford group enumeration should yield exactly 24 elements"
        );
    }

    #[test]
    fn test_two_stage_table_covers_small_y_rotation() {
        // Coverage regression test: with the previous single-stage BFS at depth 10,
        // the table contained no element closer to RY(0.3) than the identity
        // (depth-0 dist = 0.149, seq_len = 0). The two-stage Clifford+T BFS
        // expands SU(2) coverage so RY(0.3) gets a non-trivial approximation.
        let table = BasicApproximationTable::build(10);
        let target = SU2::rotation_y(0.3);
        let (seq, dist) = table.find_closest(&target);
        assert!(
            !seq.is_empty(),
            "Table should cover RY(0.3) with a non-identity sequence (got seq_len=0, dist={dist:.6})"
        );
        // Distance should be meaningfully better than the identity bound (~0.150).
        assert!(
            dist < 0.10,
            "RY(0.3) approximation should beat identity bound 0.15, got {dist:.6}"
        );
    }
}