gam 0.3.104

Generalized penalized likelihood engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
use super::family::*;
use super::gradient_paths::*;
use super::hessian_paths::{new_cell_moment_cache_stats, new_cell_moment_lru_cache};
use super::install_flex::validate_spec;
use super::*;
use crate::faer_ndarray::{FaerEigh, fast_ab, fast_atb, fast_xt_diag_x};
use crate::families::marginal_slope_orthogonal::INFLUENCE_ABSORBER_FIXED_LOG_LAMBDA;
use faer::Side;

const BMS_PROBIT_SEPARATION_BETA_INF: f64 = 40.0;

// ── BlockEffectiveJacobian impls for BMS ─────────────────────────────────────
//
// BMS has a single Bernoulli output per row (n_outputs = 1). The observed η is
//
//   η_i = q_i · c_i + s·g_i · z_i
//
// where
//   q_i   = marginal_design[i,:] · β_m + offset_m[i]      (marginal η)
//   g_i   = logslope_design[i,:] · β_s + offset_s[i]      (log-slope η)
//   s     = probit_frailty_scale(gaussian_frailty_sd)
//   c_i   = sqrt(1 + (s·g_i)²)
//
// Per-block Jacobians ∂η_i / ∂β_block:
//
//   Marginal block  → ∂η_i/∂β_m = c_i · M_i
//     (M_i = marginal_design row i; c_i is β-dependent but does not involve β_m)
//
//   Logslope block  → ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G_i
//     (G_i = logslope_design row i)
//
// score_warp_dev and link_dev blocks use IFT-corrected η, but their
// contribution to the identifiability audit is captured by the raw design
// columns (the IFT correction adds a direction already in the anchor span at
// compile time). These blocks leave jacobian_callback = None and rely on
// effective_design (= raw design) for the flat audit.

/// β-dependent Jacobian for the BMS marginal block.
///
/// ∂η_i/∂β_m = c_i · M[i,:]
/// where c_i = sqrt(1 + (s · g_i)²),
///       g_i = G[i,:] · β_s + offset_s[i],
///       s   = state.probit_frailty_scale.
///
/// `probit_frailty_scale` is read from the evaluation state at call time (not
/// captured at construction) so the callback remains correct across outer-loop
/// σ updates without rebuilding the block spec.
///
/// Designs are pre-densified at construction to avoid repeated materialisation.
pub struct BmsMarginalJacobian {
    /// Dense marginal design: n × p_m.
    pub marginal_dense: Arc<Array2<f64>>,
    /// Dense logslope design: n × p_s.
    pub logslope_dense: Arc<Array2<f64>>,
    pub offset_m: Array1<f64>,
    pub offset_s: Array1<f64>,
    /// Number of marginal columns (= size of β_m slice in the full β vector).
    pub p_marginal: usize,
}

impl BmsMarginalJacobian {
    pub fn new(
        marginal_dense: Arc<Array2<f64>>,
        logslope_dense: Arc<Array2<f64>>,
        offset_m: Array1<f64>,
        offset_s: Array1<f64>,
        p_marginal: usize,
    ) -> Self {
        Self {
            marginal_dense,
            logslope_dense,
            offset_m,
            offset_s,
            p_marginal,
        }
    }
}

impl BlockEffectiveJacobian for BmsMarginalJacobian {
    fn effective_jacobian_at(
        &self,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<Array2<f64>, String> {
        let beta = state.beta;
        let s = state.probit_frailty_scale;
        let p_m = self.p_marginal;
        let p_s_block = self.logslope_dense.ncols();
        let beta_s_raw = if beta.len() > p_m {
            &beta[p_m..]
        } else {
            &[][..]
        };
        let p_s_use = p_s_block.min(beta_s_raw.len());
        let beta_s = &beta_s_raw[..p_s_use];
        let n = self.marginal_dense.nrows();
        let p_block = self.marginal_dense.ncols();

        // ∂η_i/∂β_m = c_i · M[i,:], with c_i = sqrt(1 + (s·g_i)²) and
        //   g_i = G[i, :p_s_use] · β_s + offset_s[i].
        //
        // This block owns the logslope design and offset, so g_i — and hence
        // c_i — is fully self-computable at the current β for every row,
        // including β = 0 (where g_i = offset_s[i], which carries the fitted
        // logslope baseline and is generically nonzero).  There is no external
        // scalar this block cannot reconstruct, so the Jacobian is evaluated
        // directly from owned data with no caller-supplied contract.
        let mut out = Array2::<f64>::zeros((n, p_block));
        for i in 0..n {
            let g_i = self.offset_s[i]
                + self
                    .logslope_dense
                    .row(i)
                    .slice(ndarray::s![..p_s_use])
                    .dot(&ArrayView1::from(beta_s));
            let sg = s * g_i;
            let c_i = (1.0 + sg * sg).sqrt();
            // J[i,:] = c_i · M[i,:]
            let m_row = self.marginal_dense.row(i);
            out.row_mut(i).assign(&m_row.mapv(|x| c_i * x));
        }
        Ok(out)
    }

    fn n_outputs(&self) -> usize {
        1
    }
}

/// β-dependent Jacobian for the BMS logslope block.
///
/// ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G[i,:]
/// where q_i = M[i,:] · β_m + offset_m[i],
///       g_i = G[i,:] · β_s + offset_s[i],
///       c_i = sqrt(1 + (s·g_i)²),
///       s   = state.probit_frailty_scale.
///
/// `probit_frailty_scale` is read from the evaluation state at call time.
///
/// Designs are pre-densified at construction to avoid repeated materialisation.
pub struct BmsLogslopeJacobian {
    /// Dense marginal design: n × p_m.
    pub marginal_dense: Arc<Array2<f64>>,
    /// Dense logslope design: n × p_s.
    pub logslope_dense: Arc<Array2<f64>>,
    pub offset_m: Array1<f64>,
    pub offset_s: Array1<f64>,
    pub z: Arc<Array1<f64>>,
    /// Number of marginal columns (= start of β_s in the full β vector).
    pub p_marginal: usize,
}

impl BmsLogslopeJacobian {
    pub fn new(
        marginal_dense: Arc<Array2<f64>>,
        logslope_dense: Arc<Array2<f64>>,
        offset_m: Array1<f64>,
        offset_s: Array1<f64>,
        z: Arc<Array1<f64>>,
        p_marginal: usize,
    ) -> Self {
        Self {
            marginal_dense,
            logslope_dense,
            offset_m,
            offset_s,
            z,
            p_marginal,
        }
    }
}

impl BlockEffectiveJacobian for BmsLogslopeJacobian {
    fn effective_jacobian_at(
        &self,
        state: &FamilyLinearizationState<'_>,
    ) -> Result<Array2<f64>, String> {
        let beta = state.beta;
        let s = state.probit_frailty_scale;
        let p_m = self.p_marginal;
        let p_m_use = p_m.min(beta.len());
        let beta_m = &beta[..p_m_use];
        let beta_s_raw = if beta.len() > p_m {
            &beta[p_m..]
        } else {
            &[][..]
        };
        let p_s_block = self.logslope_dense.ncols();
        let p_s_use = p_s_block.min(beta_s_raw.len());
        let beta_s = &beta_s_raw[..p_s_use];
        let n = self.logslope_dense.nrows();

        // ∂η_i/∂β_s = (q_i · s²·g_i / c_i + s·z_i) · G[i,:] where
        //   q_i = M[i,:] · β_m + offset_m[i],
        //   g_i = G[i,:] · β_s + offset_s[i],
        //   c_i = sqrt(1 + (s·g_i)²),
        //   z_i = self.z[i].
        //
        // This block owns the marginal design, logslope design, both offsets,
        // and z, so q_i, g_i, c_i, z_i are all closed-form functions of the
        // current β and owned data — for every row at every β, including β = 0
        // (where g_i = offset_s[i] carries the nonzero fitted logslope
        // baseline).  The Jacobian is therefore evaluated directly from owned
        // data with no caller-supplied scalar contract.
        let mut out = Array2::<f64>::zeros((n, p_s_block));
        for i in 0..n {
            let q_i = self.offset_m[i]
                + self
                    .marginal_dense
                    .row(i)
                    .slice(ndarray::s![..p_m_use])
                    .dot(&ArrayView1::from(beta_m));
            let g_i = self.offset_s[i]
                + self
                    .logslope_dense
                    .row(i)
                    .slice(ndarray::s![..p_s_use])
                    .dot(&ArrayView1::from(beta_s));
            let sg = s * g_i;
            let c_i = (1.0 + sg * sg).sqrt();
            let z_i = self.z[i];
            // per-row scalar factor: q_i · s²·g_i / c_i + s·z_i
            let factor = q_i * s * s * g_i / c_i + s * z_i;
            // J[i,:] = factor · G[i,:]
            let g_row = self.logslope_dense.row(i);
            out.row_mut(i).assign(&g_row.mapv(|x| factor * x));
        }
        Ok(out)
    }

    fn n_outputs(&self) -> usize {
        1
    }
}

/// Horizontally stack the absorbed influence columns `Z̃_infl` onto the raw
/// marginal design `M`, yielding the widened additive marginal-index design
/// `[M | Z̃_infl]` (#461). When `influence_columns` is `None` the original
/// dense design is returned unchanged. The influence columns shift the marginal
/// index `α(x)` additively, so the de-nested probit kernel — which reads the
/// marginal index from `block_states[0].eta` and reconstructs `∂q/∂β_m` from
/// `self.marginal_design` (a matched (design, β) pair) — picks them up with no
/// kernel-site change; the widened `p_marginal` keeps every per-row Jacobian /
/// gradient / Hessian projection consistent.
fn widen_marginal_dense_with_influence(
    marginal_dense: &Arc<Array2<f64>>,
    influence_columns: Option<&Array2<f64>>,
) -> Result<Arc<Array2<f64>>, String> {
    let Some(z_infl) = influence_columns else {
        return Ok(Arc::clone(marginal_dense));
    };
    let n = marginal_dense.nrows();
    if z_infl.nrows() != n {
        return Err(format!(
            "influence block: residualised columns have {} rows, marginal design has {n}",
            z_infl.nrows()
        ));
    }
    let p_m = marginal_dense.ncols();
    let p1 = z_infl.ncols();
    let mut widened = Array2::<f64>::zeros((n, p_m + p1));
    widened
        .slice_mut(s![.., ..p_m])
        .assign(marginal_dense.as_ref());
    widened.slice_mut(s![.., p_m..]).assign(z_infl);
    Ok(Arc::new(widened))
}

/// Tolerance (relative to the dominant retained eigenvalue) below which a
/// reduced-basis direction of the W-orthogonalised effective logslope Gram is
/// treated as a confounded null direction and dropped. Directions whose
/// effective weighted image is (near-)explained by the marginal span collapse to
/// ~0 eigenvalue in `Gtt` (see [`build_reduced_logslope_reparam`]); this keeps
/// the cut well above floating-point noise but well below any genuine surviving
/// logslope curvature.
const LOGSLOPE_REDUCED_BASIS_RELATIVE_TOL: f64 = 1.0e-6;

/// An exact reduced-basis reparameterization of the BMS logslope design through
/// the family's OWN internal `logslope_design` geometry, expressed as a single
/// linear map `T` (`p_logslope × r`, `r ≤ p_logslope`).
///
/// # Why a reduced basis (not a dense design swap)
///
/// The structural confound is that the score-weighted logslope channel
/// `diag(factor)·G·β_s` overlaps the effective marginal channel `diag(c)·M·β_m`
/// in the PIRLS row metric `W`, leaving the joint penalised Hessian rank-soft
/// along the shared direction. The shared-solver primitive
/// [`OrthogonalReparam`](crate::solver::orthogonal_reparam::OrthogonalReparam)
/// forms `C̃ = C − M·B`, exactly W-orthogonal to `span(M)` — but `C̃` is a dense
/// design the BMS family's row kernel does NOT consume: the family reads
/// `η_logslope = G·β_s` from its own `logslope_design` and reconstructs the
/// per-row Jacobian `factor_i · G_i` from that same matrix. A block-level design
/// swap is therefore ignored by the family, and feeding a rank-deficient `C̃` at
/// full width desynchronises the inner identifiable-subspace reduction from the
/// stored design width.
///
/// This builds instead a TRUE reparameterization the family consumes: a
/// full-rank reduced logslope design `G_reduced = G·T` (width `r`) plus the
/// penalty projection `S_reduced = Tᵀ S T`. The map is constructed so that the
/// directions of raw logslope coefficient space whose effective weighted image
/// is W-explained by the marginal span are removed (they carry ~zero curvature
/// in the W-orthogonalised effective Gram), and the surviving `r` directions are
/// full-rank.
///
/// # The math
///
/// At the rigid pilot, the effective Jacobians are
///
/// ```text
///     M_eff = diag(c) · M        (n × p_m),   c_i = sqrt(1 + (s·g_i)²)
///     G_eff = diag(f) · G        (n × p_g),   f_i = q_i·s²·g_i/c_i + s·z_i
/// ```
///
/// In the row metric `W` the component of the effective logslope design that is
/// W-orthogonal to `span(M_eff)` has the raw-coordinate Gram
///
/// ```text
///     Gtt = G_effᵀ W G_eff − (G_effᵀ W M_eff)(M_effᵀ W M_eff + εI)⁻¹(M_effᵀ W G_eff)
/// ```
///
/// (a `p_g × p_g` PSD matrix in the raw logslope coefficient coordinates). Its
/// range = the logslope directions that survive the confound removal; its null
/// space = the confounded directions absorbed by the marginal span. The reduced
/// transform `T` is the orthonormal eigenbasis of `Gtt` for eigenvalues above a
/// relative tolerance; `r = rank(Gtt)`. The new design `G_reduced = G·T`, the
/// reparameterized penalty `S_reduced = Tᵀ S T`, and the round-trip
/// `β_logslope = T·β'` make the family's geometry consistent at width `r` and
/// recover the original-basis logslope coefficients for prediction/reporting.
#[derive(Debug, Clone)]
pub(super) struct ReducedLogslopeReparam {
    /// Reduced transform `T` (`p_logslope × r`). `G_reduced = G·T`,
    /// `β_logslope = T·β'`, `S_reduced = Tᵀ S T`.
    transform: Array2<f64>,
}

impl ReducedLogslopeReparam {
    /// Original (full) logslope width `p_logslope`.
    #[inline]
    pub(super) fn original_cols(&self) -> usize {
        self.transform.nrows()
    }

    /// Reduced width `r`.
    #[inline]
    pub(super) fn reduced_cols(&self) -> usize {
        self.transform.ncols()
    }

    /// Map a reduced-basis logslope coefficient `β'` (length `r`) back to the
    /// original logslope basis `β_logslope = T·β'` (length `p_logslope`), so
    /// prediction/reporting are unchanged-in-meaning.
    pub(super) fn recover_original_logslope_beta(
        &self,
        beta_reduced: &Array1<f64>,
    ) -> Result<Array1<f64>, String> {
        if beta_reduced.len() != self.reduced_cols() {
            return Err(format!(
                "reduced logslope reparam: β' length ({}) != reduced width ({})",
                beta_reduced.len(),
                self.reduced_cols()
            ));
        }
        Ok(self.transform.dot(beta_reduced))
    }
}

/// Build the reduced-basis logslope reparameterization (see
/// [`ReducedLogslopeReparam`]) from the rigid-pilot effective Jacobian geometry,
/// in the PIRLS row metric `W`. Returns `Ok(None)` when there is no logslope
/// span, no marginal span, or no confounded direction to remove (`r == p_g`):
/// in those cases the raw design already is its own reduced basis and the caller
/// keeps it unchanged.
fn build_reduced_logslope_reparam(
    marginal_design: &TermCollectionDesign,
    logslope_design: &TermCollectionDesign,
    z: &Array1<f64>,
    row_metric: &Array1<f64>,
    marginal_offset: &Array1<f64>,
    logslope_offset: &Array1<f64>,
    marginal_baseline: f64,
    logslope_baseline: f64,
    probit_scale: f64,
) -> Result<Option<ReducedLogslopeReparam>, String> {
    let marginal = marginal_design
        .design
        .try_to_dense_arc("build_reduced_logslope_reparam::marginal")?;
    let logslope = logslope_design
        .design
        .try_to_dense_arc("build_reduced_logslope_reparam::logslope")?;
    let n = marginal.nrows();
    if logslope.nrows() != n
        || z.len() != n
        || row_metric.len() != n
        || marginal_offset.len() != n
        || logslope_offset.len() != n
    {
        return Err(format!(
            "reduced logslope reparam row mismatch: marginal={}, logslope={}, z={}, row_metric={}, marginal_offset={}, logslope_offset={}",
            marginal.nrows(),
            logslope.nrows(),
            z.len(),
            row_metric.len(),
            marginal_offset.len(),
            logslope_offset.len(),
        ));
    }
    let p_m = marginal.ncols();
    let p_g = logslope.ncols();
    if p_m == 0 || p_g == 0 {
        return Ok(None);
    }
    if !marginal_baseline.is_finite()
        || !logslope_baseline.is_finite()
        || !probit_scale.is_finite()
        || probit_scale <= 0.0
        || z.iter().any(|v| !v.is_finite())
        || row_metric.iter().any(|v| !v.is_finite() || *v < 0.0)
        || marginal_offset.iter().any(|v| !v.is_finite())
        || logslope_offset.iter().any(|v| !v.is_finite())
    {
        return Err(
            "reduced logslope reparam requires finite pilot geometry and finite non-negative row metric"
                .to_string(),
        );
    }

    // W-orthogonalize the RAW logslope design `G` against the RAW marginal
    // design `M` in the PIRLS row metric `W` (the metric the joint Hessian
    // sees), via the shared-solver primitive. `C̃ = G − M·B` with `Mᵀ W C̃ = 0`
    // exactly — the released solver's pinned overlap ridge merely penalised this
    // shared direction; here it is removed by construction. `C̃` is the SAME
    // width `p_g` but rank-deficient by exactly the dimension of the
    // marginal-overlapping subspace of `G`.
    let reparam = crate::solver::orthogonal_reparam::OrthogonalReparam::build_unconditional(
        marginal.view(),
        logslope.view(),
        row_metric,
    )?;
    let c_tilde = reparam.reparameterized_confound().to_owned(); // n × p_g

    // Build a FULL-RANK reduced basis `T` (p_g × r) of the column space of the
    // raw-coordinate shear `(I − B-projection)` that produced `C̃`. Concretely
    // `C̃ = G·E` where `E = I − Bᵀ·(Mᵀ ... )`… rather than reconstruct `E`
    // algebraically, recover the raw-coordinate reduced basis directly from the
    // W-Gram of `C̃` expressed in raw logslope coordinates:
    //
    //   Stt = C̃ᵀ W C̃   (p_g × p_g, PSD), the raw-coordinate Gram of the
    //   W-orthogonal component of the logslope design.
    //
    // Its range = the logslope directions surviving the marginal-overlap
    // removal; its null space = the directions `G` shares with `span_W(M)`. A
    // direction `v` with `Stt v ≈ 0` is one whose raw logslope column `G·v` is
    // (W-)explained by the marginal span — exactly the confounded direction the
    // joint Hessian is rank-soft along. The reduced transform is the orthonormal
    // eigenbasis of `Stt` for eigenvalues above a relative tolerance.
    let stt = fast_xt_diag_x(&c_tilde, row_metric);
    let stt = (&stt + &stt.t()) * 0.5;
    if stt.iter().any(|v| !v.is_finite()) {
        return Err("reduced logslope reparam: C̃ W-Gram produced non-finite entries".to_string());
    }
    let raw_gram = fast_xt_diag_x(&logslope, row_metric);
    let raw_scale = (0..p_g).map(|i| raw_gram[[i, i]]).fold(0.0_f64, f64::max);
    let (evals, evecs) = stt
        .eigh(Side::Lower)
        .map_err(|e| format!("reduced logslope reparam: eigendecomposition failed: {e:?}"))?;
    // Tolerance relative to the RAW logslope self-Gram scale: a `C̃`-Gram
    // eigenvalue far below the raw logslope energy scale means that direction's
    // logslope column was almost entirely W-explained by the marginal span.
    if !raw_scale.is_finite() || raw_scale <= 0.0 {
        return Ok(None);
    }
    let tol = raw_scale * LOGSLOPE_REDUCED_BASIS_RELATIVE_TOL;
    let mut kept: Vec<usize> = (0..evals.len()).filter(|&i| evals[i] > tol).collect();
    kept.sort_by(|&a, &b| {
        evals[b]
            .partial_cmp(&evals[a])
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    let r = kept.len();
    // No confounded direction to remove ⇒ the raw design already is full-rank
    // and well-conditioned; keep it unchanged (byte-identical block geometry).
    if r == p_g || r == 0 {
        return Ok(None);
    }
    let mut transform = Array2::<f64>::zeros((p_g, r));
    for (out_col, &src) in kept.iter().enumerate() {
        transform.column_mut(out_col).assign(&evecs.column(src));
    }
    if transform.iter().any(|v| !v.is_finite()) {
        return Err(
            "reduced logslope reparam: reduced transform produced non-finite entries".to_string(),
        );
    }
    Ok(Some(ReducedLogslopeReparam { transform }))
}

/// Apply a [`ReducedLogslopeReparam`] to a logslope `TermCollectionDesign`,
/// producing a new design at the reduced width `r`: the design becomes
/// `G_reduced = G·T`, and every blockwise penalty `S` is reparameterized to
/// `S_reduced = Tᵀ S T` over the full reduced column range `0..r`. The reduced
/// penalty's null space is recomputed from its numerical rank so the REML
/// log-determinant accounting stays consistent at the reduced width.
fn reparameterize_logslope_design_reduced(
    logslope_design: &TermCollectionDesign,
    reparam: &ReducedLogslopeReparam,
) -> Result<TermCollectionDesign, String> {
    let g = logslope_design
        .design
        .try_to_dense_arc("reparameterize_logslope_design_reduced::logslope")?;
    let p_g = g.ncols();
    if p_g != reparam.original_cols() {
        return Err(format!(
            "reduced logslope reparam width mismatch: design has {p_g} cols, transform expects {}",
            reparam.original_cols()
        ));
    }
    let t = &reparam.transform;
    let r = reparam.reduced_cols();
    // G_reduced = G·T   (n × r).
    let g_reduced = fast_ab(&g, t);

    // Reparameterize each penalty: embed its local block at full width p_g, then
    // form S_reduced = Tᵀ S T (r × r) over the whole reduced column range.
    let mut new_penalties: Vec<crate::terms::smooth::BlockwisePenalty> =
        Vec::with_capacity(logslope_design.penalties.len());
    let mut new_nullspace_dims: Vec<usize> = Vec::with_capacity(logslope_design.penalties.len());
    for bp in &logslope_design.penalties {
        let mut full = Array2::<f64>::zeros((p_g, p_g));
        full.slice_mut(s![bp.col_range.clone(), bp.col_range.clone()])
            .assign(&bp.local);
        // S_reduced = Tᵀ (S) T.
        let st = fast_ab(&full, t); // p_g × r
        let mut s_reduced = fast_atb(t, &st); // r × r
        s_reduced = (&s_reduced + &s_reduced.t()) * 0.5;
        // Null-space dimension of the reduced penalty = r − rank(S_reduced).
        let (evals, _) = s_reduced
            .eigh(Side::Lower)
            .map_err(|e| format!("reduced logslope penalty eigendecomposition failed: {e:?}"))?;
        let max_eval = evals.iter().fold(0.0_f64, |acc, &v| acc.max(v.abs()));
        let pen_tol = (max_eval * 1.0e-12).max(f64::EPSILON);
        let rank = evals.iter().filter(|&&v| v.abs() > pen_tol).count();
        let nullspace_dim = r.saturating_sub(rank);
        new_penalties.push(crate::terms::smooth::BlockwisePenalty::new(0..r, s_reduced));
        new_nullspace_dims.push(nullspace_dim);
    }

    let new_design = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(g_reduced));
    // The reduced logslope block is a single dense smooth-like surface over the
    // reparameterized coordinates; it carries no parametric/random-effect/
    // intercept structure of its own (those live in the marginal block), so the
    // structural ranges collapse to empty and the smooth metadata is cleared.
    // The penalties + nullspace_dims above are what the joint REML consumes.
    Ok(TermCollectionDesign {
        design: new_design,
        penalties: new_penalties,
        nullspace_dims: new_nullspace_dims,
        penaltyinfo: Vec::new(),
        dropped_penaltyinfo: Vec::new(),
        coefficient_lower_bounds: None,
        linear_constraints: None,
        intercept_range: 0..0,
        linear_ranges: Vec::new(),
        random_effect_ranges: Vec::new(),
        random_effect_levels: Vec::new(),
        smooth: crate::terms::smooth::SmoothDesign {
            term_designs: Vec::new(),
            penalties: Vec::new(),
            nullspace_dims: Vec::new(),
            penaltyinfo: Vec::new(),
            dropped_penaltyinfo: Vec::new(),
            terms: Vec::new(),
            coefficient_lower_bounds: None,
            linear_constraints: None,
        },
    })
}

/// Re-embed the term-collection marginal penalties at the (possibly widened)
/// block dimension `p_m [+ p₁]`, then append the #461 fixed-ridge absorber:
///
///  (#461, only with influence columns) the fixed-ridge absorber identity on
///  the influence columns `p_m..p_m+p₁`.
///
/// The two former gam#754 pinned ridges — the marginal nullspace-shrinkage ridge
/// and the marginal↔logslope overlap ridge — are DELETED: robustness is now
/// unconditional, so the full-identifiable-span Jeffreys term (`Z_J = I`, see
/// `jeffreys_subspace_from_penalty`) supplies automatic O(n)-scaled curvature on
/// every under-identified direction (subsuming the nullspace ridge), and the
/// exact orthogonal reparameterization of the logslope design (now unconditional,
/// see `build_reduced_logslope_reparam`) resolves the marginal↔logslope confound
/// by construction (subsuming the overlap ridge).
///
/// The genuine marginal smooth penalties keep their `col_range` (marginal
/// columns stay in `0..p_m`). Returns `(penalties, nullspace_dims,
/// initial_log_lambdas)` to install on the marginal block. Fixed ridges remain
/// in this physical penalty layout and are removed from every REML/outer
/// coordinate vector by [`PenaltyMatrix::Fixed`].
fn marginal_penalties_with_influence_ridge(
    design: &TermCollectionDesign,
    rho_marginal: &Array1<f64>,
    influence_columns: Option<&Array2<f64>>,
    influence_ridge_log_lambda: f64,
) -> Result<(Vec<PenaltyMatrix>, Vec<usize>, Array1<f64>), String> {
    let p_m = design.design.ncols();
    let p1 = influence_columns.map(|z| z.ncols()).unwrap_or(0);
    let total_dim = p_m + p1;
    // Re-embed each marginal penalty at the (widened) total dimension (col_range
    // unchanged: marginal columns remain 0..p_m).
    let mut penalties: Vec<PenaltyMatrix> = design
        .penalties
        .iter()
        .map(|bp| PenaltyMatrix::from_blockwise(bp.clone(), total_dim))
        .collect();
    let mut nullspace_dims = design.nullspace_dims.clone();
    let mut log_lambdas = rho_marginal.to_vec();

    // (#461) fixed-ridge absorber: identity on the influence columns only.
    // Full rank (nullspace 0); its log λ is pinned out of REML by a degenerate
    // ρ box.
    if p1 > 0 {
        penalties.push(
            PenaltyMatrix::Blockwise {
                local: Array2::<f64>::eye(p1),
                col_range: p_m..total_dim,
                total_dim,
            }
            .with_fixed_log_lambda(influence_ridge_log_lambda),
        );
        nullspace_dims.push(0);
        log_lambdas.push(influence_ridge_log_lambda);
    }

    Ok((penalties, nullspace_dims, Array1::from_vec(log_lambdas)))
}

/// Widen an optional β warm-start hint to the influence-widened marginal
/// dimension, zero-filling the absorber coefficients `γ` (#461).
fn widen_marginal_beta_hint(
    beta_hint: Option<Array1<f64>>,
    p_marginal_widened: usize,
) -> Option<Array1<f64>> {
    beta_hint.map(|hint| {
        if hint.len() == p_marginal_widened {
            hint
        } else {
            let mut widened = Array1::<f64>::zeros(p_marginal_widened);
            let copy = hint.len().min(p_marginal_widened);
            widened
                .slice_mut(s![..copy])
                .assign(&hint.slice(s![..copy]));
            widened
        }
    })
}

fn argmax_by_abs<I>(values: I) -> Option<(String, usize, f64)>
where
    I: IntoIterator<Item = (String, usize, f64)>,
{
    values
        .into_iter()
        .map(|(label, idx, value)| (label, idx, value.abs()))
        .filter(|(_, _, abs)| abs.is_finite())
        .max_by(|left, right| {
            left.2
                .partial_cmp(&right.2)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
}

fn marginal_parametric_argmax_from_beta(
    beta: &Array1<f64>,
    design: &TermCollectionDesign,
    spec: &TermCollectionSpec,
) -> Option<(String, usize, f64)> {
    let mut entries = Vec::<(String, usize, f64)>::new();
    if design.intercept_range.len() == 1 {
        let idx = design.intercept_range.start;
        if idx < beta.len() {
            entries.push(("intercept".to_string(), idx, beta[idx]));
        }
    }
    for (linear, (name, range)) in spec.linear_terms.iter().zip(design.linear_ranges.iter()) {
        if linear.double_penalty {
            continue;
        }
        for local_col in range.clone() {
            if local_col < beta.len() {
                entries.push((name.clone(), local_col, beta[local_col]));
            }
        }
    }
    argmax_by_abs(entries)
}

fn marginal_parametric_argmax_from_warm_start(
    warm_start: &CustomFamilyWarmStart,
    design: &TermCollectionDesign,
    spec: &TermCollectionSpec,
) -> Option<(String, usize, f64)> {
    let mut entries = Vec::<(String, usize, f64)>::new();
    if design.intercept_range.len() == 1
        && let Some((idx, abs)) =
            warm_start.block_beta_abs_argmax_in_range(0, design.intercept_range.clone())
    {
        entries.push(("intercept".to_string(), idx, abs));
    }
    for (linear, (name, range)) in spec.linear_terms.iter().zip(design.linear_ranges.iter()) {
        if linear.double_penalty {
            continue;
        }
        if let Some((idx, abs)) = warm_start.block_beta_abs_argmax_in_range(0, range.clone()) {
            entries.push((name.clone(), idx, abs));
        }
    }
    argmax_by_abs(entries)
}

fn marginal_full_argmax_from_beta(
    beta: &Array1<f64>,
    design: &TermCollectionDesign,
) -> Option<(String, usize, f64)> {
    let mut entries = Vec::<(String, usize, f64)>::new();
    if design.intercept_range.len() == 1 {
        let idx = design.intercept_range.start;
        if idx < beta.len() {
            entries.push(("intercept".to_string(), idx, beta[idx]));
        }
    }
    for (name, range) in &design.linear_ranges {
        for local_col in range.clone() {
            if local_col < beta.len() {
                entries.push((name.clone(), local_col, beta[local_col]));
            }
        }
    }
    for (name, range) in &design.random_effect_ranges {
        for local_col in range.clone() {
            if local_col < beta.len() {
                entries.push((name.clone(), local_col, beta[local_col]));
            }
        }
    }
    let smooth_start = design
        .design
        .ncols()
        .saturating_sub(design.smooth.total_smooth_cols());
    for term in &design.smooth.terms {
        let label = format!("smooth '{}'", term.name);
        let start = smooth_start + term.coeff_range.start;
        let end = smooth_start + term.coeff_range.end;
        for local_col in start..end {
            if local_col < beta.len() {
                entries.push((label.clone(), local_col, beta[local_col]));
            }
        }
    }
    for local_col in design.design.ncols()..beta.len() {
        entries.push((
            "fixed-ridge influence absorber".to_string(),
            local_col,
            beta[local_col],
        ));
    }
    argmax_by_abs(entries)
}

fn marginal_full_argmax_from_warm_start(
    warm_start: &CustomFamilyWarmStart,
    design: &TermCollectionDesign,
) -> Option<(String, usize, f64)> {
    let block_width = warm_start.block_beta_len(0)?;
    let mut entries = Vec::<(String, usize, f64)>::new();
    if design.intercept_range.len() == 1
        && let Some((idx, abs)) =
            warm_start.block_beta_abs_argmax_in_range(0, design.intercept_range.clone())
    {
        entries.push(("intercept".to_string(), idx, abs));
    }
    for (name, range) in &design.linear_ranges {
        if let Some((idx, abs)) = warm_start.block_beta_abs_argmax_in_range(0, range.clone()) {
            entries.push((name.clone(), idx, abs));
        }
    }
    for (name, range) in &design.random_effect_ranges {
        if let Some((idx, abs)) = warm_start.block_beta_abs_argmax_in_range(0, range.clone()) {
            entries.push((name.clone(), idx, abs));
        }
    }
    let smooth_start = design
        .design
        .ncols()
        .saturating_sub(design.smooth.total_smooth_cols());
    for term in &design.smooth.terms {
        let range = (smooth_start + term.coeff_range.start)..(smooth_start + term.coeff_range.end);
        if let Some((idx, abs)) = warm_start.block_beta_abs_argmax_in_range(0, range) {
            entries.push((format!("smooth '{}'", term.name), idx, abs));
        }
    }
    if block_width > design.design.ncols()
        && let Some((idx, abs)) =
            warm_start.block_beta_abs_argmax_in_range(0, design.design.ncols()..block_width)
    {
        entries.push(("fixed-ridge influence absorber".to_string(), idx, abs));
    }
    argmax_by_abs(entries)
}

fn bernoulli_marginal_slope_runaway_error_from_argmax(
    parametric_argmax: Option<(String, usize, f64)>,
    block_argmax: Option<(String, usize, f64)>,
    inner_status: &str,
    eval_label: &str,
) -> Option<String> {
    let (label, local_col, beta_abs, explanation) = if let Some((label, local_col, beta_abs)) =
        parametric_argmax
        && beta_abs >= BMS_PROBIT_SEPARATION_BETA_INF
    {
        (
            label,
            local_col,
            beta_abs,
            "an unpenalized parametric marginal direction has no stable finite probit optimum",
        )
    } else if let Some((label, local_col, beta_abs)) = block_argmax
        && beta_abs >= BMS_PROBIT_SEPARATION_BETA_INF
    {
        (
            label,
            local_col,
            beta_abs,
            "a marginal smooth direction is trading off against the logslope surface; this is the under-constrained marginal/logslope coupling that appears when the score is correlated with the shared surface covariates",
        )
    } else {
        return None;
    };
    if beta_abs < BMS_PROBIT_SEPARATION_BETA_INF {
        return None;
    }
    Some(format!(
        "bernoulli marginal-slope probit marginal/logslope runaway detected in block \
         'marginal_surface' during {eval_label}: term '{label}' \
         (local column {local_col}) has \
         |β|∞={beta_abs:.3e} (diagnostic threshold \
         {BMS_PROBIT_SEPARATION_BETA_INF:.1}). The joint design is identifiable; \
         {explanation}. {inner_status}. The robust Jeffreys curvature path is \
         already installed for this fit, so this diagnostic means the current \
         coupled surface still exposes a separation-scale direction rather than \
         a request for an external bias-reduction prior. Reduce or \
         reparameterize the coupled marginal/logslope surface, or use a \
         lower-dimensional logslope interaction. This is not a \
         Matérn/Duchon polynomial-nullspace or cross-block gauge-priority \
         failure."
    ))
}

fn bernoulli_marginal_slope_runaway_error(
    warm_start: &CustomFamilyWarmStart,
    design: &TermCollectionDesign,
    spec: &TermCollectionSpec,
    inner_converged: bool,
    eval_label: &str,
) -> Option<String> {
    let inner_status = if inner_converged {
        "the inner solve reached a KKT certificate at a separation-scale coefficient"
    } else {
        "the inner solve failed while already carrying a separation-scale coefficient"
    };
    bernoulli_marginal_slope_runaway_error_from_argmax(
        marginal_parametric_argmax_from_warm_start(warm_start, design, spec),
        marginal_full_argmax_from_warm_start(warm_start, design),
        inner_status,
        eval_label,
    )
}

#[cfg(test)]
mod runaway_tests {
    use super::*;
    use crate::faer_ndarray::{FaerArrayView, factorize_symmetricwith_fallback, fast_xt_diag_y};

    // The marginal↔logslope overlap penalty is no longer installed as a pinned
    // ridge (subsumed by the now-unconditional exact logslope orthogonalisation in
    // `build_reduced_logslope_reparam`). The geometry helper is retained here under
    // the test module because the basis-independence/weight-orthogonality unit tests
    // below exercise it directly as the canonical overlap-direction reference.
    fn marginal_logslope_overlap_penalty(
        marginal_design: &DesignMatrix,
        logslope_design: &DesignMatrix,
        z: &Array1<f64>,
        row_metric: &Array1<f64>,
        marginal_offset: &Array1<f64>,
        logslope_offset: &Array1<f64>,
        marginal_baseline: f64,
        logslope_baseline: f64,
        probit_scale: f64,
    ) -> Result<Option<Array2<f64>>, String> {
        let marginal =
            marginal_design.try_to_dense_arc("marginal_logslope_overlap_penalty::marginal")?;
        let logslope =
            logslope_design.try_to_dense_arc("marginal_logslope_overlap_penalty::logslope")?;
        let n = marginal.nrows();
        if logslope.nrows() != n
            || z.len() != n
            || row_metric.len() != n
            || marginal_offset.len() != n
            || logslope_offset.len() != n
        {
            return Err(format!(
                "marginal/logslope overlap penalty row mismatch: marginal={}, logslope={}, z={}, row_metric={}, marginal_offset={}, logslope_offset={}",
                marginal.nrows(),
                logslope.nrows(),
                z.len(),
                row_metric.len(),
                marginal_offset.len(),
                logslope_offset.len(),
            ));
        }
        let p_m = marginal.ncols();
        let p_g = logslope.ncols();
        if p_m == 0 || p_g == 0 {
            return Ok(None);
        }
        if !marginal_baseline.is_finite()
            || !logslope_baseline.is_finite()
            || !probit_scale.is_finite()
            || probit_scale <= 0.0
            || z.iter().any(|v| !v.is_finite())
            || row_metric.iter().any(|v| !v.is_finite() || *v < 0.0)
            || marginal_offset.iter().any(|v| !v.is_finite())
            || logslope_offset.iter().any(|v| !v.is_finite())
        {
            return Err(
                "marginal/logslope overlap penalty requires finite pilot geometry and finite non-negative row metric"
                    .to_string(),
            );
        }

        let mut marginal_effective = Array2::<f64>::zeros((n, p_m));
        let mut effective_logslope = Array2::<f64>::zeros((n, p_g));
        for i in 0..n {
            let q_i = marginal_offset[i] + marginal_baseline;
            let g_i = logslope_offset[i] + logslope_baseline;
            let sg = probit_scale * g_i;
            let c_i = (1.0 + sg * sg).sqrt();
            let logslope_factor =
                q_i * probit_scale * probit_scale * g_i / c_i + probit_scale * z[i];
            for j in 0..p_m {
                marginal_effective[[i, j]] = c_i * marginal[[i, j]];
            }
            for j in 0..p_g {
                effective_logslope[[i, j]] = logslope_factor * logslope[[i, j]];
            }
        }
        if effective_logslope.iter().all(|v| v.abs() <= f64::EPSILON) {
            return Ok(None);
        }

        let mut gram = fast_xt_diag_x(&effective_logslope, row_metric);
        let gram_scale = gram.diag().iter().copied().fold(0.0_f64, f64::max);
        if !gram_scale.is_finite() || gram_scale <= 0.0 {
            return Ok(None);
        }
        let projection_ridge = (gram_scale * 1.0e-10).max(f64::EPSILON);
        for i in 0..p_g {
            gram[[i, i]] += projection_ridge;
        }
        let cross = fast_xt_diag_y(&effective_logslope, row_metric, &marginal_effective);
        let gram_view = FaerArrayView::new(&gram);
        let factor = factorize_symmetricwith_fallback(gram_view.as_ref(), Side::Lower)
            .map_err(|e| format!("marginal/logslope overlap Gram factorization failed: {e}"))?;
        let rhsview = FaerArrayView::new(&cross);
        let coeffs_mat = factor.solve(rhsview.as_ref());
        let coeffs = Array2::from_shape_fn((p_g, p_m), |(i, j)| coeffs_mat[(i, j)]);
        let projected_marginal = fast_ab(&effective_logslope, &coeffs);
        let mut penalty = fast_xt_diag_y(&marginal_effective, row_metric, &projected_marginal);
        penalty = (&penalty + &penalty.t()) * 0.5;
        let max_abs = penalty.iter().map(|v| v.abs()).fold(0.0_f64, f64::max);
        if !max_abs.is_finite() || max_abs <= 1.0e-12 {
            return Ok(None);
        }
        Ok(Some(penalty))
    }

    #[test]
    fn spatial_joint_setup_counts_only_learned_penalties_in_rho() {
        let data = Array2::<f64>::zeros((3, 1));
        let empty_terms = TermCollectionSpec {
            linear_terms: Vec::new(),
            random_effect_terms: Vec::new(),
            smooth_terms: Vec::new(),
        };
        let setup = joint_setup(
            data.view(),
            &empty_terms,
            &empty_terms,
            2,
            3,
            &[0.4],
            &SpatialLengthScaleOptimizationOptions::default(),
        );

        assert_eq!(
            setup.rho_dim(),
            6,
            "BMS spatial setup rho must contain only learned marginal/logslope/auxiliary penalties; fixed physical ridges are carried by PenaltyMatrix::Fixed"
        );
    }

    #[test]
    fn overlap_penalty_targets_score_weighted_logslope_span() {
        let marginal = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_vec((4, 1), vec![0.0, 1.0, 2.0, 3.0]).unwrap(),
        ));
        let logslope = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_vec((4, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap(),
        ));
        let z = Array1::from_vec(vec![0.0, 1.0, 2.0, 3.0]);
        let row_metric = Array1::ones(4);
        let offsets = Array1::zeros(4);

        let penalty = marginal_logslope_overlap_penalty(
            &marginal,
            &logslope,
            &z,
            &row_metric,
            &offsets,
            &offsets,
            0.0,
            0.0,
            1.0,
        )
        .expect("overlap penalty should build")
        .expect("marginal signal lies in the pilot logslope Jacobian span");

        assert_eq!(penalty.dim(), (1, 1));
        assert!((penalty[[0, 0]] - 14.0).abs() < 1.0e-6);
    }

    #[test]
    fn overlap_penalty_skips_weight_orthogonal_channels() {
        let marginal = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_vec((4, 1), vec![-1.0, 1.0, -1.0, 1.0]).unwrap(),
        ));
        let logslope = DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            Array2::from_shape_vec((4, 1), vec![1.0, 1.0, 1.0, 1.0]).unwrap(),
        ));
        let z = Array1::ones(4);
        let row_metric = Array1::ones(4);
        let offsets = Array1::zeros(4);

        let penalty = marginal_logslope_overlap_penalty(
            &marginal,
            &logslope,
            &z,
            &row_metric,
            &offsets,
            &offsets,
            0.0,
            0.0,
            1.0,
        )
        .expect("overlap penalty should build");

        assert!(penalty.is_none());
    }

    #[test]
    fn runaway_diagnostic_names_unpenalized_parametric_direction_first() {
        let msg = bernoulli_marginal_slope_runaway_error_from_argmax(
            Some(("sex".to_string(), 1, 52.0)),
            Some(("smooth 'matern(PC1,PC2,PC3)'".to_string(), 7, 49.0)),
            "inner status",
            "unit-test eval",
        )
        .expect("parametric runaway should be diagnosed");

        assert!(msg.contains("term 'sex'"));
        assert!(msg.contains("unpenalized parametric marginal direction"));
        assert!(msg.contains("robust Jeffreys curvature path is already installed"));
        assert!(!msg.contains("explicit declared separation/bias-reduction prior"));
        assert!(msg.contains("not a Matérn/Duchon polynomial-nullspace"));
    }

    #[test]
    fn runaway_diagnostic_names_marginal_logslope_coupling_when_smooth_runs_away() {
        let msg = bernoulli_marginal_slope_runaway_error_from_argmax(
            Some(("sex".to_string(), 1, 2.0)),
            Some(("smooth 'marginal_surface[0]'".to_string(), 6, 51.4)),
            "inner status",
            "unit-test eval",
        )
        .expect("smooth runaway should be diagnosed");

        assert!(msg.contains("marginal/logslope runaway"));
        assert!(msg.contains("smooth 'marginal_surface[0]'"));
        assert!(msg.contains("score is correlated with the shared surface covariates"));
        assert!(msg.contains("not a Matérn/Duchon polynomial-nullspace"));
    }
}

fn build_marginal_blockspec_bms(
    design: &TermCollectionDesign,
    baseline: f64,
    offset: &Array1<f64>,
    rho: Array1<f64>,
    beta_hint: Option<Array1<f64>>,
    logslope_design: &TermCollectionDesign,
    logslope_offset: &Array1<f64>,
    logslope_baseline: f64,
    p_marginal: usize,
    influence_columns: Option<&Array2<f64>>,
    influence_ridge_log_lambda: f64,
) -> Result<ParameterBlockSpec, String> {
    let offset_m = offset + baseline;
    let offset_s = logslope_offset + logslope_baseline;
    let raw_marginal_dense = design
        .design
        .try_to_dense_arc("build_marginal_blockspec_bms::marginal")?;
    let marginal_dense =
        widen_marginal_dense_with_influence(&raw_marginal_dense, influence_columns)?;
    let logslope_dense = logslope_design
        .design
        .try_to_dense_arc("build_marginal_blockspec_bms::logslope")?;
    let callback: Arc<dyn BlockEffectiveJacobian> = Arc::new(BmsMarginalJacobian {
        marginal_dense: Arc::clone(&marginal_dense),
        logslope_dense,
        offset_m: offset_m.clone(),
        offset_s,
        p_marginal,
    });
    let (penalties, nullspace_dims, initial_log_lambdas) = marginal_penalties_with_influence_ridge(
        design,
        &rho,
        influence_columns,
        influence_ridge_log_lambda,
    )?;
    Ok(ParameterBlockSpec {
        name: "marginal_surface".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            (*marginal_dense).clone(),
        )),
        offset: offset_m,
        penalties,
        nullspace_dims,
        initial_log_lambdas,
        initial_beta: widen_marginal_beta_hint(beta_hint, p_marginal),
        // Canonical-gauge architecture (issue #322): give marginal_surface
        // strictly higher priority than logslope_surface so the priority-
        // ordered RRQR in `canonicalize_for_identifiability` presents
        // marginal columns first and routes any cross-block alias drop into
        // logslope.  Equal priorities (the previous default of 100/100)
        // produced a same-priority `hard_alias_pair` whenever a
        // high-dimensional smooth — e.g. `s(x, type=duchon, centers>=6)`
        // in the location block — accidentally spanned the logslope basis
        // direction, leaving the joint Hessian with a structural null and
        // the spectral Newton solve refusing to step.  The values mirror
        // the canonical-gauge entry for survival marginal-slope
        // (marginal=150, logslope=120).
        gauge_priority: 150,
        jacobian_callback: Some(callback),
        stacked_design: None,
        stacked_offset: None,
    })
}

fn build_logslope_blockspec_bms(
    design: &TermCollectionDesign,
    baseline: f64,
    offset: &Array1<f64>,
    rho: Array1<f64>,
    beta_hint: Option<Array1<f64>>,
    marginal_design: &TermCollectionDesign,
    marginal_offset: &Array1<f64>,
    marginal_baseline: f64,
    z: Arc<Array1<f64>>,
    p_marginal: usize,
    influence_columns: Option<&Array2<f64>>,
) -> Result<ParameterBlockSpec, String> {
    let offset_s = offset + baseline;
    let offset_m = marginal_offset + marginal_baseline;
    let raw_marginal_dense = marginal_design
        .design
        .try_to_dense_arc("build_logslope_blockspec_bms::marginal")?;
    // The logslope Jacobian reconstructs q_i = M·β_m + offset_m; with the
    // absorbed influence columns folded into the marginal index, the marginal
    // reference design and p_marginal MUST be the widened [M | Z̃] / (p_m+p₁)
    // so β_m slices to the absorber and q_i carries the Z̃·γ shift (#461).
    let marginal_dense =
        widen_marginal_dense_with_influence(&raw_marginal_dense, influence_columns)?;
    let logslope_dense = design
        .design
        .try_to_dense_arc("build_logslope_blockspec_bms::logslope")?;
    let callback: Arc<dyn BlockEffectiveJacobian> = Arc::new(BmsLogslopeJacobian {
        marginal_dense,
        logslope_dense: Arc::clone(&logslope_dense),
        offset_m,
        offset_s: offset_s.clone(),
        z,
        p_marginal,
    });
    Ok(ParameterBlockSpec {
        name: "logslope_surface".to_string(),
        design: DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from(
            (*logslope_dense).clone(),
        )),
        offset: offset_s,
        penalties: design.penalties_as_penalty_matrix(),
        nullspace_dims: design.nullspace_dims.clone(),
        initial_log_lambdas: rho,
        initial_beta: beta_hint,
        // Canonical-gauge architecture (issue #322): logslope is strictly
        // lower priority than marginal so the priority-ordered RRQR in
        // `canonicalize_for_identifiability` demotes a shared cross-block
        // direction here, not in marginal.  Mirrors the survival-mgs
        // value (marginal=150, logslope=120).  See the matching comment
        // on `build_marginal_blockspec_bms` for the failure mode this
        // resolves.
        gauge_priority: 120,
        jacobian_callback: Some(callback),
        stacked_design: None,
        stacked_offset: None,
    })
}

pub(crate) fn build_deviation_aux_blockspec(
    name: &str,
    prepared: &DeviationPrepared,
    rho: Array1<f64>,
    beta_hint: Option<Array1<f64>>,
) -> Result<ParameterBlockSpec, String> {
    let mut block = prepared.block.clone();
    block.initial_log_lambdas = Some(rho);
    let candidate_beta = beta_hint.or_else(|| Some(Array1::<f64>::zeros(block.design.ncols())));
    block.initial_beta = candidate_beta
        .map(|beta| {
            let zero = Array1::<f64>::zeros(beta.len());
            project_monotone_feasible_beta(&prepared.runtime, &zero, &beta, name)
        })
        .transpose()?;
    let mut spec = block.intospec(name)?;
    // Deviation auxiliary blocks (score_warp_dev, link_dev, and any
    // future flex block routed through this builder) model pure
    // shape modifications on top of parametric anchors. They must
    // never own a shared affine direction with the parametric
    // (time / marginal / logslope) blocks. The canonical-gauge
    // selector drops shared directions from blocks with lower
    // gauge_priority first; assigning a value below the parametric
    // default (100) realises that contract automatically.
    spec.gauge_priority = match name {
        "link_dev" => 60,
        // score_warp_dev gets a slightly higher priority than link_dev
        // because in mixed-flex configurations (both blocks present)
        // link_dev is the residualised one (orthogonalised against the
        // parametric anchors PLUS the already-prepared score_warp
        // basis at construction time); link_dev should therefore yield
        // first when an alias still survives into the joint design.
        "score_warp_dev" => 80,
        _ => 70,
    };
    Ok(spec)
}

pub(crate) fn push_deviation_aux_blockspecs(
    blocks: &mut Vec<ParameterBlockSpec>,
    rho: &Array1<f64>,
    cursor: &mut usize,
    score_warp_prepared: Option<&DeviationPrepared>,
    link_dev_prepared: Option<&DeviationPrepared>,
    score_warp_beta_hint: Option<Array1<f64>>,
    link_dev_beta_hint: Option<Array1<f64>>,
) -> Result<(), String> {
    if let Some(prepared) = score_warp_prepared {
        let rho_h = rho
            .slice(s![*cursor..*cursor + prepared.block.penalties.len()])
            .to_owned();
        *cursor += prepared.block.penalties.len();
        blocks.push(build_deviation_aux_blockspec(
            "score_warp_dev",
            prepared,
            rho_h,
            score_warp_beta_hint,
        )?);
    }
    if let Some(prepared) = link_dev_prepared {
        let rho_w = rho
            .slice(s![*cursor..*cursor + prepared.block.penalties.len()])
            .to_owned();
        blocks.push(build_deviation_aux_blockspec(
            "link_dev",
            prepared,
            rho_w,
            link_dev_beta_hint,
        )?);
    }
    Ok(())
}

fn inner_fit(
    family: &BernoulliMarginalSlopeFamily,
    blocks: &[ParameterBlockSpec],
    options: &BlockwiseFitOptions,
) -> Result<UnifiedFitResult, String> {
    let mut options = options.clone();
    // BMS carries fixed physical ridge penalties that regularize coefficient
    // geometry but are not REML coordinates. The exact hyper-Hessian route can
    // stall after that projection; the family has a dedicated exact-gradient
    // path with full-data polish, so make it the primary nested smoother.
    options.use_outer_hessian = false;
    options.outer_tol = options.outer_tol.max(2.0e-5);
    fit_custom_family(family, blocks, &options).map_err(|e| e.to_string())
}

pub fn fit_bernoulli_marginal_slope_terms(
    data: ArrayView2<'_, f64>,
    spec: BernoulliMarginalSlopeTermSpec,
    options: &BlockwiseFitOptions,
    kappa_options: &SpatialLengthScaleOptimizationOptions,
    policy: &crate::resource::ResourcePolicy,
) -> Result<BernoulliMarginalSlopeFitResult, String> {
    let mut spec = spec;
    let data_view = data;
    validate_spec(data_view, &spec)?;
    let mut effective_kappa_options = kappa_options.clone();
    // Honor explicit `length_scale=X` in the user's formula: when every
    // spatial term in BOTH the marginal mean and log-slope blocks carries
    // a user-supplied scalar length scale and no per-axis anisotropy is
    // requested, there is nothing for the joint-spatial outer optimizer
    // to do. Routing through it anyway spends ~80 outer ARC iters stalled
    // at the user's chosen ρ (the n-block ARC's first proposed step lands
    // at the box corner and never recovers), then falls through to the
    // ρ-only "custom family" path which is what we wanted all along.
    // Short-circuit straight to the ρ-only path.
    let kappa_locked_marginal = crate::smooth::all_spatial_terms_kappa_fixed(&spec.marginalspec);
    let kappa_locked_logslope = crate::smooth::all_spatial_terms_kappa_fixed(&spec.logslopespec);
    if effective_kappa_options.enabled && kappa_locked_marginal && kappa_locked_logslope {
        log::info!(
            "[BMS spatial] disabling κ/ψ optimization: every spatial term has an \
             explicit length_scale and no anisotropy; user-supplied kernel scale is fixed"
        );
        effective_kappa_options.enabled = false;
    }
    let flex_spatial_pilot_path = (spec.score_warp.is_some() || spec.link_dev.is_some())
        && spec.y.len() >= BMS_FLEX_SPATIAL_OUTER_PILOT_ROW_THRESHOLD
        && effective_kappa_options.enabled;
    if flex_spatial_pilot_path {
        let marginal_terms = spatial_length_scale_term_indices(&spec.marginalspec);
        let logslope_terms = spatial_length_scale_term_indices(&spec.logslopespec);
        let marginal_updates = apply_spatial_anisotropy_pilot_initializer(
            data_view,
            &mut spec.marginalspec,
            &marginal_terms,
            effective_kappa_options.pilot_subsample_threshold,
            &effective_kappa_options,
        );
        let logslope_updates = apply_spatial_anisotropy_pilot_initializer(
            data_view,
            &mut spec.logslopespec,
            &logslope_terms,
            effective_kappa_options.pilot_subsample_threshold,
            &effective_kappa_options,
        );
        effective_kappa_options.enabled = false;
        log::info!(
            "[BMS spatial] n={} flex=true pilot_geometry_updates={} iterative_spatial_outer=false reason=large-flex-spatial-pilot",
            spec.y.len(),
            marginal_updates + logslope_updates,
        );
    }
    let (z_standardized, z_normalization) = standardize_latent_z_with_policy(
        &spec.z,
        &spec.weights,
        "bernoulli-marginal-slope",
        &spec.latent_z_policy,
    )?;
    spec.z = z_standardized;
    let sigma_learnable = matches!(
        &spec.frailty,
        FrailtySpec::GaussianShift { sigma_fixed: None }
    );
    let initial_sigma = match &spec.frailty {
        FrailtySpec::GaussianShift {
            sigma_fixed: Some(s),
        } => Some(*s),
        FrailtySpec::GaussianShift { sigma_fixed: None } => Some(0.5),
        FrailtySpec::None => None,
        FrailtySpec::HazardMultiplier { .. } => {
            return Err(
                "internal: validate_spec should have rejected unsupported marginal-slope frailty"
                    .to_string(),
            );
        }
    };
    let probit_scale = probit_frailty_scale(initial_sigma);
    let (_raw_joint_designs, mut joint_specs) = build_term_collection_designs_and_freeze_joint(
        data_view,
        &[spec.marginalspec.clone(), spec.logslopespec.clone()],
    )
    .map_err(|e| e.to_string())?;
    let marginalspec_boot = joint_specs.remove(0);
    let logslopespec_boot = joint_specs.remove(0);
    // Rebuild the probe designs from the frozen `*_boot` specs so the probe's
    // penalty topology matches the topology produced by every other build path
    // in this optimization. The spatial optimizer's own bootstrap
    // (`build_term_collection_designs_and_freeze_joint(data, &[marginalspec_boot,
    // logslopespec_boot])` inside `optimize_spatial_length_scale_exact_joint`)
    // and every subsequent kappa-driven rebuild feed the basis builder the
    // captured `FrozenTransform` identifiability. Applying that captured
    // transform to the same kernel can land the structural null-space block on
    // the other side of `build_nullspace_shrinkage_penalty`'s spectral
    // tolerance, so the raw and frozen builds disagree on whether the trend
    // ridge survives as an active penalty candidate. Without this rebuild,
    // `marginal_penalty_count` / `logslope_design.penalties.len()` are taken
    // from the raw build but every subsequent evaluator measures the frozen
    // build, and `evaluate_custom_family_joint_hyper` refuses with a
    // "joint hyper rho dimension mismatch". Mirrors the CTN-side fix in
    // `fit_transformation_normal`.
    let (mut joint_designs, _) = build_term_collection_designs_and_freeze_joint(
        data_view,
        &[marginalspec_boot.clone(), logslopespec_boot.clone()],
    )
    .map_err(|e| format!("failed to rebuild frozen probe BMS joint designs: {e}"))?;
    let marginal_design = joint_designs.remove(0);
    let logslope_design = joint_designs.remove(0);
    let (latent_measure, latent_z_calibration) =
        build_latent_measure_with_geometry(&spec.z, &spec.weights, &spec.latent_z_policy)?;
    if latent_measure.is_empirical() && sigma_learnable {
        return Err("empirical latent-measure marginal-slope calibration requires fixed GaussianShift sigma; learnable sigma derivatives must be fit under the standard-normal latent measure"
                    .to_string());
    }

    let y = Arc::new(spec.y.clone());
    let weights = Arc::new(spec.weights.clone());
    // Apply rank-INT calibration to training z before any downstream
    // consumer (pooled probit baseline, term-collection designs, the
    // family's PIRLS loops) sees it. The calibration is persisted on the
    // fit result so prediction applies the identical monotone map.
    let z = match &latent_z_calibration {
        LatentMeasureCalibration::None => Arc::new(spec.z.clone()),
        LatentMeasureCalibration::RankInverseNormal(cal) => {
            Arc::new(cal.apply_to_training(&spec.z)?)
        }
    };
    let z_train = z.as_ref();
    let pilot_baseline = pooled_probit_baseline(&spec.y, z_train, &spec.weights)?;
    let baseline = (
        bernoulli_marginal_slope_eta_from_probability(
            &spec.base_link,
            normal_cdf(pilot_baseline.0),
            "bernoulli marginal-slope baseline link inversion",
        )?,
        pilot_baseline.1 / probit_scale,
    );

    // Score-warp basis construction is β-independent (identifiability is
    // provided by the smoothness-null-space drop on the basis transform,
    // not by a data-distribution moment anchor at the rigid-pilot η₀), so
    // the standard-normal and empirical latent-measure branches build the
    // same block. There is no row-weight pilot to thread into the basis;
    // the latent-measure split is enforced upstream via the empirical
    // intercept solve in `build_row_exact_context_with_stats`, not in the
    // deviation basis.
    // Score-warp basis is built first, then immediately reparameterised
    // against the parametric span (marginal + logslope columns at the n
    // training rows) so its column span is orthogonal to span(X_marginal,
    // X_logslope) by construction. This is the first half of the joint-
    // design identifiability invariant; the second half (link-deviation
    // orthogonalised against parametric + the now-reparameterised score-
    // warp) runs inside the link-deviation closure below. Together they
    // ensure `[X_marginal | X_logslope | Φ_score_warp · T_sw |
    // Φ_link_dev · T_lw]` has full numerical column rank, structurally
    // bounding `σ_min(joint H + S) ≥ λ_min(S) > 0` regardless of how β
    // drifts the linear predictor distribution during PIRLS.
    // Cross-block W-metric pilot. The joint penalised Hessian during PIRLS
    // uses the probit-style data Hessian row metric
    //
    //   W_pirls[i] = spec.weights[i] · φ(η_i)² / (μ_i·(1−μ_i))
    //
    // which is the canonical IRLS row weight. The cross-block
    // orthogonalisation below must use this metric (not uniform
    // spec.weights) so that `Aᵀ W C̃ = 0` holds in the same inner product
    // the joint Hessian sees — otherwise A and C̃ are merely Euclidean-
    // orthogonal, `Aᵀ W_pirls C̃ ≠ 0`, the joint Hessian carries a near-
    // null direction along the W-metric alias, and REML can drive the
    // flex block's λ small enough that the alias direction's joint
    // Hessian eigenvalue collapses. β then runs away along the alias
    // (manifest as `rho≈2.0`, constant `step_inf`, growing `beta_inf`
    // during PIRLS, and the inner solve hitting `inner_max_cycles`
    // without satisfying the KKT residual).
    //
    // Use the rigid pooled-probit pilot η for score-warp (its basis is
    // β-independent in z, so the rigid pilot suffices) and the one-GN-
    // stepped pilot η for link-deviation (its basis is evaluated at the
    // same eta_pilot used here, so the orthogonalisation metric matches
    // the basis evaluation point exactly). Both are β-independent so the
    // orthogonalisation remains a one-shot construction-time step.
    let rigid_pilot_eta = rigid_pooled_probit_pilot_eta(
        &spec.base_link,
        z_train,
        &spec.marginal_offset,
        &spec.logslope_offset,
        baseline.0,
        baseline.1,
        probit_scale,
    )?;
    let cross_block_pilot_w_score_warp =
        pilot_irls_hessian_row_metric_at_eta(&rigid_pilot_eta, &spec.weights);

    // Absorbed Stage-1 influence columns (#461, design §3). When the workflow
    // chained a CTN Stage-1 into this marginal-slope fit, `spec.score_influence_
    // jacobian` carries the out-of-fold `J = ∂z/∂θ₁`; the realized leakage
    // directions `Z_infl = diag(s_f·β̂₀)·J` are residualised against the
    // marginal span (logslope-aligned component retained) and appended to the
    // additive marginal-index block as a fixed-ridge absorber, so the joint
    // penalised solve makes the (α,β) score orthogonal to span(Z_infl) — the
    // x-dependent realisation of `ψ − Π_η[ψ]`. `None` ⇒ raw z, and the free
    // score_warp spline below is the x-free-column fallback. β̂₀(x_i) is the
    // rigid-pilot logslope `baseline.1 + logslope_offset[i]`; s_f = probit_scale.
    let influence_columns = if let Some(jac) = spec
        .score_influence_jacobian
        .as_ref()
        .filter(|j| j.ncols() > 0)
    {
        let marginal_dense_for_proj = marginal_design
            .design
            .try_to_dense_arc("bernoulli marginal-slope influence-block marginal projection")?;
        let marginal_dense = marginal_dense_for_proj.as_ref();
        if jac.nrows() != marginal_dense.nrows() {
            return Err(format!(
                "influence block: Jacobian has {} rows, marginal design has {}",
                jac.nrows(),
                marginal_dense.nrows()
            ));
        }
        // Z̃ = residualize(diag(s_f·β̂₀)·J) against the marginal span in the
        // rigid-pilot W-metric, via the SHARED core entry point (single source
        // of truth across bms + survival; the two families differ ONLY in how
        // they install the returned Z̃ — bms widens [M | Z̃], survival adds a
        // dedicated η₁ channel). β̂₀(x_i) = baseline.1 + logslope_offset[i];
        // s_f = probit_scale; W = the rigid-pilot PIRLS row metric.
        let rigid_logslope_at_rows = &spec.logslope_offset + baseline.1;
        let residualized =
            crate::families::marginal_slope_orthogonal::residualized_influence_block(
                jac,
                z_train,
                &rigid_logslope_at_rows,
                probit_scale,
                marginal_dense.view(),
                &cross_block_pilot_w_score_warp,
            )?;
        Some(residualized)
    } else {
        None
    };
    let mut cross_block_warnings: Vec<CrossBlockIdentifiabilityWarning> = Vec::new();
    let score_warp_prepared = if let Some(cfg) = spec.score_warp.as_ref() {
        use super::deviation_runtime::ParametricAnchorBlock;
        let mut prepared = build_score_warp_deviation_block_from_seed(z_train, cfg)?;
        // `install_compiled_flex_block_into_runtime` now delegates
        // its math body to `identifiability_compiler::compile` (commit
        // 4e20b8dc8); the prior Phase-4a shadow compile here was a
        // duplicate of that internal call and has been removed.
        let outcome = install_compiled_flex_block_into_runtime(
            &mut prepared,
            z_train,
            cfg,
            &[
                (&marginal_design.design, ParametricAnchorBlock::Marginal),
                (&logslope_design.design, ParametricAnchorBlock::Logslope),
            ],
            &[],
            &cross_block_pilot_w_score_warp,
        )?;
        match outcome {
            FlexCompileOutcome::Reparameterised => Some(prepared),
            FlexCompileOutcome::FullyAliased { reason } => {
                // Record via the structured channel. Keep the original
                // (non-compiled) design so the unified audit sees score_warp_dev
                // and attributes the drop via dropped_columns (gauge_priority=80
                // is below marginal=150 / logslope=120, so RRQR correctly
                // demotes score_warp_dev when it aliases those blocks).
                cross_block_warnings.push(CrossBlockIdentifiabilityWarning {
                    candidate_label: "score_warp",
                    anchor_summary: "marginal+logslope".to_string(),
                    reason,
                });
                Some(prepared)
            }
        }
    } else {
        None
    };
    // Build the link-deviation block. The basis lives in η-space, and at
    // PIRLS time `runtime.design(η_current)` is re-evaluated at the
    // current β-dependent η, so the basis is genuinely β-dependent during
    // optimisation. The construction-time seed is used only for (a) knot
    // placement in η-space and (b) the cross-block identifiability check
    // that computes the basis-space transform `T` orthogonalising the
    // candidate against the parametric and score-warp anchors at training
    // rows.
    //
    // Using the rigid pooled probit pilot directly (`q0 = a₀·√(…) + s_f·
    // b₀·z`) is structurally degenerate: with zero per-row offsets it is
    // affine in z, so a degree-3 I-spline of `q0` spans the same column
    // space at training rows as a degree-3 I-spline of z, and the cross-
    // block check finds the candidate fully aliased by the score-warp
    // anchor even though at any non-rigid β the link-deviation carries
    // PC/age structure the score-warp cannot represent.
    //
    // Instead, seed both knot placement and the orthogonalisation pivot at
    // a non-rigid pilot η computed via one probit Gauss-Newton step from
    // the rigid pilot onto the full marginal design (see
    // `pilot_eta_for_link_dev_orthogonalisation`). The pilot is row-varying
    // in PCs/age and the resulting `T` drops only directions aliased
    // across all β. The score-warp basis at training rows is also threaded
    // in as a flex anchor when active so the kept directions are jointly
    // orthogonal to parametric ⊕ score-warp.
    let link_dev_prepared = if let Some(cfg) = spec.link_dev.as_ref() {
        let eta_pilot = pilot_eta_for_link_dev_orthogonalisation(
            &spec.base_link,
            &spec.y,
            z_train,
            &spec.weights,
            &marginal_design.design,
            &spec.marginal_offset,
            &spec.logslope_offset,
            baseline.0,
            baseline.1,
            probit_scale,
        )?;
        let link_dev_seed = padded_deviation_seed(&eta_pilot, 1.0, 0.5);
        let mut prepared = build_link_deviation_block_from_knots_design_seed_and_weights(
            &link_dev_seed,
            &eta_pilot,
            cfg,
        )?;
        // Cross-block identifiability for the link-deviation basis. The
        // anchor union covers BOTH possible aliasing channels:
        //
        //  - Parametric: location and logslope designs evaluated at the n
        //    training rows. Columns of `Φ_link_dev(q0)` that reproduce
        //    parametric features become null-direction targets in the
        //    joint penalised Hessian since `S_link_dev` has no mass on
        //    them.
        //
        //  - Score-warp (when active): the now-reparameterised score-warp
        //    basis, also evaluated at training rows. Both flex bases are
        //    cubic I-spline cubic combinations of an η-pilot scalar, and
        //    even with each block's own smoothness-null-space drop their
        //    column spans can still overlap inside the orthogonal
        //    complement of `{1, η_pilot}`.
        //
        // After the orthogonalisation, `[X_marginal | X_logslope |
        // Φ_score_warp · T_sw | Φ_link_dev · T_lw]` has full numerical
        // column rank at training rows, so `σ_min(joint H+S) ≥ λ_min(S)
        // > 0` for every β. This is the standard GAM `gam.side`
        // convention generalised to multi-anchor unions (mgcv applies it
        // sequentially across smooths sharing a covariate).
        // When `install_compiled_flex_block_into_runtime`
        // reparameterised the score-warp runtime against the parametric
        // anchor union (marginal + logslope), it installed an
        // `anchor_residual` and cached the training-row parametric
        // anchor matrix on the runtime. `runtime.design()` on a
        // residualised runtime returns the *raw* basis evaluation,
        // which `assert`s the caller hasn't conflated with the
        // reparameterised basis — we want the reparameterised one
        // here, so go through `design_at_training_with_residual` so
        // the cached anchor rows are folded in. For score-warp
        // configurations where reparameterisation was a no-op (no
        // residual installed) the same call falls back to the raw
        // `design()` path, so the residual-vs-no-residual branches
        // converge on the right matrix.
        let score_warp_anchor_design = score_warp_prepared
            .as_ref()
            .map(|sw| sw.runtime.design_at_training_with_residual(z_train))
            .transpose()?;
        use super::deviation_runtime::ParametricAnchorBlock;
        let parametric_anchors: [(&DesignMatrix, ParametricAnchorBlock); 2] = [
            (&marginal_design.design, ParametricAnchorBlock::Marginal),
            (&logslope_design.design, ParametricAnchorBlock::Logslope),
        ];
        let flex_anchor_slot: Option<&Array2<f64>> = score_warp_anchor_design.as_ref();
        let flex_anchors: Vec<&Array2<f64>> = flex_anchor_slot.into_iter().collect();
        // W-metric for link-deviation orthogonalisation: same IRLS-style
        // probit Hessian row weight as the score-warp path, but evaluated at
        // `eta_pilot` (the one-GN-stepped pilot at which the link-dev basis
        // itself is anchored).
        let cross_block_pilot_w_link_dev =
            pilot_irls_hessian_row_metric_at_eta(&eta_pilot, &spec.weights);
        let outcome = install_compiled_flex_block_into_runtime(
            &mut prepared,
            &eta_pilot,
            cfg,
            &parametric_anchors,
            &flex_anchors,
            &cross_block_pilot_w_link_dev,
        )?;
        match outcome {
            FlexCompileOutcome::Reparameterised => Some(prepared),
            FlexCompileOutcome::FullyAliased { reason } => {
                // Record via the structured channel. Keep the original
                // (non-compiled) design so the unified audit sees link_dev
                // and attributes the drop via dropped_columns (gauge_priority=60
                // is below all parametric blocks so RRQR correctly demotes
                // link_dev when it aliases marginal / logslope / score_warp).
                cross_block_warnings.push(CrossBlockIdentifiabilityWarning {
                    candidate_label: "link_deviation",
                    anchor_summary: "marginal+logslope+score_warp".to_string(),
                    reason,
                });
                Some(prepared)
            }
        }
    } else {
        None
    };
    let extra_rho0 = {
        let mut out = Vec::new();
        if let Some(ref prepared) = score_warp_prepared {
            out.extend(std::iter::repeat_n(0.0, prepared.block.penalties.len()));
        }
        if let Some(ref prepared) = link_dev_prepared {
            out.extend(std::iter::repeat_n(0.0, prepared.block.penalties.len()));
        }
        out
    };
    // Reduced-basis orthogonalisation of the logslope design through the BMS
    // family's OWN internal `logslope_design` geometry (robust cure for the
    // marginal↔logslope structural confound). Robustness is unconditional, so we
    // always reparameterize the logslope coordinate space to a full-rank reduced
    // basis `T` whose effective weighted columns are W-orthogonal to the marginal
    // span at the rigid pilot — removing the rank-soft confounded direction the
    // former pinned overlap ridge merely penalised. The transform is
    // β/ρ-independent (pilot geometry only), so it is a one-shot construction-
    // time map applied to every per-iteration logslope design inside
    // `build_blocks` / `make_family`, and inverted at fit-result assembly so the
    // reported logslope β is in the original basis. `None` ⇒ nothing to reduce
    // (no rank-soft confounded direction) ⇒ raw design used everywhere.
    let logslope_reduced_reparam: Option<ReducedLogslopeReparam> = build_reduced_logslope_reparam(
        &marginal_design,
        &logslope_design,
        z.as_ref(),
        &cross_block_pilot_w_score_warp,
        &spec.marginal_offset,
        &spec.logslope_offset,
        baseline.0,
        baseline.1,
        probit_scale,
    )?;
    // Apply the reduced reparam to a logslope `TermCollectionDesign`, or return
    // the raw design clone when the reparam is absent (flag off / nothing to
    // reduce). Used by both `build_blocks` and `make_family` so the family's
    // internal design, the block design, β width, jacobian, penalty, and the
    // `validate_exact_block_state_shapes` check all agree at the reduced width.
    let reduce_logslope_design =
        |logslope_design: &TermCollectionDesign| -> Result<TermCollectionDesign, String> {
            match logslope_reduced_reparam.as_ref() {
                Some(reparam) => reparameterize_logslope_design_reduced(logslope_design, reparam),
                None => Ok(logslope_design.clone()),
            }
        };

    let marginal_penalty_count = marginal_design.penalties.len();
    let setup = joint_setup(
        data_view,
        &marginalspec_boot,
        &logslopespec_boot,
        marginal_penalty_count,
        logslope_design.penalties.len(),
        &extra_rho0,
        &effective_kappa_options,
    );
    let setup = if sigma_learnable {
        setup.with_auxiliary(
            Array1::from_vec(vec![initial_sigma.expect("learnable sigma seed").ln()]),
            Array1::from_vec(vec![0.01_f64.ln()]),
            Array1::from_vec(vec![5.0_f64.ln()]),
        )
    } else {
        setup
    };
    let final_sigma_cell = std::cell::Cell::new(initial_sigma);
    let exact_warm_start = RefCell::new(None::<CustomFamilyWarmStart>);
    let runaway_error = RefCell::new(None::<String>);
    // Outer ρ-cache β-seed staging slot. On a cache hit the spatial-joint
    // optimizer invokes `seed_inner_beta_fn` before the first eval at the
    // restored ρ: per-block column widths aren't known until the first
    // `build_blocks(rho, …)` runs, so we stash the flat β here and the eval
    // closures promote it into `exact_warm_start` (the slot the inner
    // PIRLS / Newton solve actually consumes) on their first invocation.
    let pending_beta_seed = RefCell::new(None::<Array1<f64>>);
    let hints = RefCell::new(ThetaHints::default());
    let score_warp_runtime = score_warp_prepared.as_ref().map(|p| p.runtime.clone());
    let link_dev_runtime = link_dev_prepared.as_ref().map(|p| p.runtime.clone());

    let build_blocks = |rho: &Array1<f64>,
                        marginal_design: &TermCollectionDesign,
                        logslope_design: &TermCollectionDesign|
     -> Result<Vec<ParameterBlockSpec>, String> {
        let hints = hints.borrow();
        let mut cursor = 0usize;
        // Reduced-basis orthogonalisation: replace the per-iteration logslope
        // design with its full-rank reduced reparameterization `G·T` (flag ON);
        // a no-op clone when off. The reduced design carries the SAME number of
        // penalties (each S → Tᵀ S T), so the `rho_logslope` slice width below
        // is unchanged. Every consumer (marginal jacobian's c_i, logslope
        // blockspec design/β/penalty/jacobian) now agrees at the reduced width.
        let logslope_design_reduced = reduce_logslope_design(logslope_design)?;
        let logslope_design = &logslope_design_reduced;
        // Fixed #754/#461 ridges are appended inside
        // `marginal_penalties_with_influence_ridge` as physical penalties and
        // are excluded from `rho`; only genuine REML-learned smooth penalties
        // appear in the spatial joint setup.
        let rho_marginal = rho
            .slice(s![cursor..cursor + marginal_design.penalties.len()])
            .to_owned();
        cursor += marginal_design.penalties.len();
        let rho_logslope = rho
            .slice(s![cursor..cursor + logslope_design.penalties.len()])
            .to_owned();
        cursor += logslope_design.penalties.len();
        let p_m = marginal_design.design.ncols()
            + influence_columns.as_ref().map(|z| z.ncols()).unwrap_or(0);
        let mut blocks = vec![
            build_marginal_blockspec_bms(
                marginal_design,
                baseline.0,
                &spec.marginal_offset,
                rho_marginal,
                hints.marginal_beta.clone(),
                logslope_design,
                &spec.logslope_offset,
                baseline.1,
                p_m,
                influence_columns.as_ref(),
                INFLUENCE_ABSORBER_FIXED_LOG_LAMBDA,
            )?,
            build_logslope_blockspec_bms(
                logslope_design,
                baseline.1,
                &spec.logslope_offset,
                rho_logslope,
                hints.logslope_beta.clone(),
                marginal_design,
                &spec.marginal_offset,
                baseline.0,
                Arc::clone(&z),
                p_m,
                influence_columns.as_ref(),
            )?,
        ];
        push_deviation_aux_blockspecs(
            &mut blocks,
            rho,
            &mut cursor,
            score_warp_prepared.as_ref(),
            link_dev_prepared.as_ref(),
            hints.score_warp_beta.clone(),
            hints.link_dev_beta.clone(),
        )?;
        Ok(blocks)
    };

    let intercept_warm_starts = new_intercept_warm_start_cache(y.len());
    let cell_moment_lru = new_cell_moment_lru_cache(policy);
    let cell_moment_cache_stats = new_cell_moment_cache_stats();
    let make_family = |marginal_design: &TermCollectionDesign,
                       logslope_design: &TermCollectionDesign,
                       sigma: Option<f64>|
     -> BernoulliMarginalSlopeFamily {
        // The kernel reads the marginal index from a matched (self.marginal_
        // design, β_m) pair. When the Stage-1 influence absorber is active the
        // marginal β is widened to [β_m; γ], so the family's marginal design
        // MUST be the widened [M | Z̃] for every per-row projection to slice
        // correctly (#461). With no absorber it is the raw design unchanged.
        let kernel_marginal_design = match influence_columns.as_ref() {
            Some(z_infl) => {
                let raw = marginal_design
                    .design
                    .try_to_dense_arc("make_family::widened-marginal")
                    .expect("dense marginal design for influence widening");
                let widened = widen_marginal_dense_with_influence(&raw, Some(z_infl))
                    .expect("widen marginal design with influence columns");
                DesignMatrix::Dense(crate::matrix::DenseDesignMatrix::from((*widened).clone()))
            }
            None => marginal_design.design.clone(),
        };
        // The family's row kernel reconstructs η_logslope = G·β_s and the
        // logslope Jacobian factor_i·G_i from this matched (logslope_design,
        // β_s) pair, so it MUST be the SAME reduced design `G·T` the block specs
        // fit against — otherwise β_s (reduced width) and the family design
        // (full width) desync. A no-op clone when the reparam is absent.
        let kernel_logslope_design = reduce_logslope_design(logslope_design)
            .expect("reduce logslope design for family construction")
            .design;
        BernoulliMarginalSlopeFamily {
            y: Arc::clone(&y),
            weights: Arc::clone(&weights),
            z: Arc::clone(&z),
            latent_measure: latent_measure.clone(),
            gaussian_frailty_sd: sigma,
            base_link: spec.base_link.clone(),
            marginal_design: kernel_marginal_design,
            logslope_design: kernel_logslope_design,
            score_warp: score_warp_runtime.clone(),
            link_dev: link_dev_runtime.clone(),
            policy: policy.clone(),
            cell_moment_lru: Arc::clone(&cell_moment_lru),
            cell_moment_cache_stats: Arc::clone(&cell_moment_cache_stats),
            intercept_warm_starts: Some(Arc::clone(&intercept_warm_starts)),
            auto_subsample_phase_counter: Arc::new(std::sync::atomic::AtomicUsize::new(0)),
            auto_subsample_last_rho: Arc::new(Mutex::new(None)),
        }
    };

    let marginal_terms = spatial_length_scale_term_indices(&marginalspec_boot);
    let logslope_terms = spatial_length_scale_term_indices(&logslopespec_boot);
    let marginal_has_spatial = !marginal_terms.is_empty();
    let logslope_has_spatial = !logslope_terms.is_empty();
    let analytic_joint_derivatives_available =
        marginal_has_spatial || logslope_has_spatial || setup.log_kappa_dim() == 0;
    if setup.log_kappa_dim() > 0 && !analytic_joint_derivatives_available {
        return Err("exact bernoulli marginal-slope spatial optimization requires analytic joint psi derivatives"
                    .to_string());
    }
    let initial_rho = setup.theta0().slice(s![..setup.rho_dim()]).to_owned();
    let initial_blocks = build_blocks(&initial_rho, &marginal_design, &logslope_design)?;
    let initial_family = make_family(&marginal_design, &logslope_design, initial_sigma);
    let (joint_gradient, joint_hessian) =
        custom_family_outer_derivatives(&initial_family, &initial_blocks, options);
    let analytic_joint_gradient_available = analytic_joint_derivatives_available
        && matches!(
            joint_gradient,
            crate::solver::outer_strategy::Derivative::Analytic
        );
    // Keep the analytic outer Hessian advertised at biobank scale. The
    // row-tensor terms below are represented through block-local
    // `HyperOperator`s and cached exact-Hessian workspaces, so ARC/trust-region
    // can consume exact HVPs without falling back to BFGS merely because the
    // realized problem is large.
    let analytic_joint_hessian_available =
        analytic_joint_derivatives_available && joint_hessian.is_analytic();
    let kappa_options_ref: &SpatialLengthScaleOptimizationOptions = &effective_kappa_options;
    let sigma_from_theta = |theta: &Array1<f64>| -> Option<f64> {
        if sigma_learnable {
            Some(theta[setup.rho_dim() + setup.log_kappa_dim()].exp())
        } else {
            initial_sigma
        }
    };
    let derivative_block_cache = RefCell::new(
        None::<(
            Array1<f64>,
            Arc<Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>>,
        )>,
    );
    let theta_matches = |left: &Array1<f64>, right: &Array1<f64>| -> bool {
        left.len() == right.len()
            && left
                .iter()
                .zip(right.iter())
                .all(|(lhs, rhs)| (*lhs - *rhs).abs() <= 1e-12 * (1.0 + lhs.abs().max(rhs.abs())))
    };
    let get_derivative_blocks = |theta: &Array1<f64>,
                                 specs: &[TermCollectionSpec],
                                 designs: &[TermCollectionDesign]|
     -> Result<
        Arc<Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>>,
        String,
    > {
        if let Some((cached_theta, cached_blocks)) = derivative_block_cache.borrow().as_ref()
            && theta_matches(cached_theta, theta)
        {
            return Ok(Arc::clone(cached_blocks));
        }

        let built = |specs: &[TermCollectionSpec],
                     designs: &[TermCollectionDesign]|
         -> Result<
            Vec<Vec<crate::custom_family::CustomFamilyBlockPsiDerivative>>,
            String,
        > {
            let marginal_psi_derivs = if marginal_has_spatial {
                build_block_spatial_psi_derivatives(data_view, &specs[0], &designs[0])?.ok_or_else(
                    || {
                        "bernoulli marginal-slope: marginal block has spatial terms \
                         but spatial psi derivatives are unavailable"
                            .to_string()
                    },
                )?
            } else {
                Vec::new()
            };
            let logslope_psi_derivs = if logslope_has_spatial {
                build_block_spatial_psi_derivatives(data_view, &specs[1], &designs[1])?.ok_or_else(
                    || {
                        "bernoulli marginal-slope: logslope block has spatial terms \
                         but spatial psi derivatives are unavailable"
                            .to_string()
                    },
                )?
            } else {
                Vec::new()
            };
            let mut derivative_blocks = vec![marginal_psi_derivs, logslope_psi_derivs];
            if score_warp_runtime.is_some() {
                derivative_blocks.push(Vec::new());
            }
            if link_dev_runtime.is_some() {
                derivative_blocks.push(Vec::new());
            }
            if sigma_learnable {
                derivative_blocks
                    .last_mut()
                    .expect("bernoulli derivative block list is non-empty")
                    .push(crate::custom_family::CustomFamilyBlockPsiDerivative::new(
                        None,
                        Array2::zeros((0, 0)),
                        Array2::zeros((0, 0)),
                        None,
                        None,
                        None,
                        None,
                    ));
            }
            Ok(derivative_blocks)
        }(specs, designs)?;
        let built = Arc::new(built);
        derivative_block_cache.replace(Some((theta.clone(), Arc::clone(&built))));
        Ok(built)
    };

    // Bernoulli marginal-slope is a multi-block family with β-dependent
    // joint Hessian: EFS/HybridEFS fixed-point structural invariant fails,
    // so we disable fixed-point at plan time rather than burning cycles on
    // a stalled first attempt that silently falls back.
    let outer_policy = {
        let psi_dim = setup.theta0().len() - setup.rho_dim();
        initial_family.outer_derivative_policy(&initial_blocks, psi_dim, options)
    };
    let exact_spatial_outer_tol = kappa_options_ref.rel_tol.max(1e-6);
    let solved = optimize_spatial_length_scale_exact_joint(
        data_view,
        &[marginalspec_boot.clone(), logslopespec_boot.clone()],
        &[marginal_terms.clone(), logslope_terms.clone()],
        kappa_options_ref,
        &setup,
        crate::seeding::SeedRiskProfile::GeneralizedLinear,
        analytic_joint_gradient_available,
        analytic_joint_hessian_available,
        true,
        None,
        outer_policy,
        |theta, specs: &[TermCollectionSpec], designs: &[TermCollectionDesign]| {
            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
                return Err(err);
            }
            assert_eq!(
                specs.len(),
                designs.len(),
                "spatial joint optimizer must supply one spec per design",
            );
            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
            let sigma = sigma_from_theta(theta);
            final_sigma_cell.set(sigma);
            let family = make_family(&designs[0], &designs[1], sigma);
            let fit = inner_fit(&family, &blocks, options)?;
            if let Some(block) = fit.block_states.first()
                && let Some(err) = bernoulli_marginal_slope_runaway_error_from_argmax(
                    marginal_parametric_argmax_from_beta(&block.beta, &designs[0], &specs[0]),
                    marginal_full_argmax_from_beta(&block.beta, &designs[0]),
                    "the final inner solve produced a separation-scale coefficient",
                    "final fit",
                )
            {
                runaway_error.replace(Some(err.clone()));
                return Err(err);
            }
            let mut hints_mut = hints.borrow_mut();
            let mut bidx = 0usize;
            if let Some(block) = fit.block_states.get(bidx) {
                hints_mut.marginal_beta = Some(block.beta.clone());
            }
            bidx += 1;
            if let Some(block) = fit.block_states.get(bidx) {
                hints_mut.logslope_beta = Some(block.beta.clone());
            }
            bidx += 1;
            if score_warp_prepared.is_some() {
                if let Some(block) = fit.block_states.get(bidx) {
                    hints_mut.score_warp_beta = Some(block.beta.clone());
                }
                bidx += 1;
            }
            if link_dev_prepared.is_some()
                && let Some(block) = fit.block_states.get(bidx)
            {
                hints_mut.link_dev_beta = Some(block.beta.clone());
            }
            Ok(fit)
        },
        |theta,
         specs: &[TermCollectionSpec],
         designs: &[TermCollectionDesign],
         eval_mode,
         row_set: &crate::families::row_kernel::RowSet| {
            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
                return Err(err);
            }
            use crate::solver::estimate::reml::unified::EvalMode;
            let row_set_rows = match row_set {
                crate::families::row_kernel::RowSet::All => spec.y.len(),
                crate::families::row_kernel::RowSet::Subsample { rows, .. } => rows.len(),
            };
            log::debug!(
                "[BMS exact outer eval] mode={:?} row_set_rows={}",
                eval_mode,
                row_set_rows
            );
            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
            // Promote a staged β seed (deposited by the outer ρ-cache hit
            // before any eval ran) into the family warm-start slot now that
            // we know the per-block widths from the freshly built blocks.
            if let Some(beta_seed) = pending_beta_seed.borrow_mut().take() {
                let widths: Vec<usize> = blocks.iter().map(|b| b.design.ncols()).collect();
                match CustomFamilyWarmStart::from_cached_beta(&widths, &beta_seed) {
                    Ok(ws) => {
                        exact_warm_start.replace(Some(ws));
                    }
                    Err(e) => {
                        log::warn!(
                            "[BMS] outer ρ-cache β-warm-start rejected: {e}; falling back to cold β"
                        );
                    }
                }
            }
            let sigma = sigma_from_theta(theta);
            final_sigma_cell.set(sigma);
            let family = make_family(&designs[0], &designs[1], sigma);
            let derivative_blocks = get_derivative_blocks(theta, specs, designs)?;
            // Downgrade to ValueAndGradient when the caller asks for a
            // Hessian we can't provide; preserve ValueOnly probes for
            // line-search cost-only evaluation.
            let effective_mode = match eval_mode {
                EvalMode::ValueGradientHessian if !analytic_joint_hessian_available => {
                    EvalMode::ValueAndGradient
                }
                other => other,
            };
            let eval = evaluate_custom_family_joint_hyper_shared(
                &family,
                &blocks,
                &joint_hyper_options_for_outer_tolerance(options, exact_spatial_outer_tol),
                &rho,
                derivative_blocks,
                exact_warm_start.borrow().as_ref(),
                effective_mode,
            )?;
            if let Some(err) = bernoulli_marginal_slope_runaway_error(
                &eval.warm_start,
                &designs[0],
                &specs[0],
                eval.inner_converged,
                "exact outer evaluation",
            ) {
                runaway_error.replace(Some(err.clone()));
                return Err(err);
            }
            exact_warm_start.replace(Some(eval.warm_start.clone()));
            if !eval.inner_converged {
                return Err(
                    "exact bernoulli marginal-slope inner solve did not converge".to_string(),
                );
            }
            if matches!(eval_mode, EvalMode::ValueGradientHessian)
                && analytic_joint_hessian_available
                && !eval.outer_hessian.is_analytic()
            {
                return Err("exact bernoulli marginal-slope joint [rho, psi] objective did not return an outer Hessian"
                            .to_string());
            }
            Ok((eval.objective, eval.gradient, eval.outer_hessian))
        },
        |theta, specs: &[TermCollectionSpec], designs: &[TermCollectionDesign]| {
            if let Some(err) = runaway_error.borrow().as_ref().cloned() {
                return Err(err);
            }
            let rho = theta.slice(s![..setup.rho_dim()]).to_owned();
            let blocks = build_blocks(&rho, &designs[0], &designs[1])?;
            if let Some(beta_seed) = pending_beta_seed.borrow_mut().take() {
                let widths: Vec<usize> = blocks.iter().map(|b| b.design.ncols()).collect();
                match CustomFamilyWarmStart::from_cached_beta(&widths, &beta_seed) {
                    Ok(ws) => {
                        exact_warm_start.replace(Some(ws));
                    }
                    Err(e) => {
                        log::warn!(
                            "[BMS] outer ρ-cache β-warm-start rejected (efs): {e}; falling back to cold β"
                        );
                    }
                }
            }
            let sigma = sigma_from_theta(theta);
            final_sigma_cell.set(sigma);
            let family = make_family(&designs[0], &designs[1], sigma);
            let derivative_blocks = get_derivative_blocks(theta, specs, designs)?;
            let eval = evaluate_custom_family_joint_hyper_efs_shared(
                &family,
                &blocks,
                &joint_hyper_options_for_outer_tolerance(options, exact_spatial_outer_tol),
                &rho,
                derivative_blocks,
                exact_warm_start.borrow().as_ref(),
            )?;
            if let Some(err) = bernoulli_marginal_slope_runaway_error(
                &eval.warm_start,
                &designs[0],
                &specs[0],
                eval.inner_converged,
                "EFS outer evaluation",
            ) {
                runaway_error.replace(Some(err.clone()));
                return Err(err);
            }
            exact_warm_start.replace(Some(eval.warm_start.clone()));
            if !eval.inner_converged {
                return Err(
                    "exact bernoulli marginal-slope EFS inner solve did not converge".to_string(),
                );
            }
            Ok(eval.efs_eval)
        },
        crate::families::marginal_slope_shared::make_beta_seed_validator(&pending_beta_seed),
    )?;

    let mut resolved_specs = solved.resolved_specs;
    let mut designs = solved.designs;
    // Reduced-basis round-trip (robust cure). When the logslope design was
    // orthogonalised to a reduced basis `G·T`, the fitted logslope coefficient
    // `β'` lives in the reduced coordinates (width `r`). The returned
    // `logslope_design` / `logslopespec_resolved` are the ORIGINAL full-width
    // basis (prediction rebuilds full `G` from the resolved spec), so map the
    // reported logslope coefficients back to the original basis `β_logslope =
    // T·β'` (predictor-identical: `G·(T·β') = (G·T)·β'`). The marginal block,
    // aux blocks, and the internal reduced-width flat β/geometry are untouched;
    // only the per-block reported logslope coefficients (blocks[1] and
    // block_states[1]) — which prediction/reporting consume against the full
    // design — are lifted to full width.
    let mut solved_fit = solved.fit;
    if let Some(reparam) = logslope_reduced_reparam.as_ref() {
        let r = reparam.reduced_cols();
        if let Some(block) = solved_fit.blocks.get_mut(1)
            && block.beta.len() == r
        {
            block.beta = reparam.recover_original_logslope_beta(&block.beta)?;
        }
        if let Some(state) = solved_fit.block_states.get_mut(1)
            && state.beta.len() == r
        {
            state.beta = reparam.recover_original_logslope_beta(&state.beta)?;
        }
    }
    let latent_z_rank_int_calibration = match latent_z_calibration {
        LatentMeasureCalibration::None => None,
        LatentMeasureCalibration::RankInverseNormal(cal) => Some(cal),
    };
    // #461: PREDICT SEAM — when the Stage-1 influence absorber is active
    // (spec.score_influence_jacobian.is_some()), `fit.block_states[0].beta` is
    // the WIDENED marginal coefficient `[β_m; γ]` (length p_m + p₁), but
    // `marginal_design` below is the RAW term-collection design (p_m columns):
    // the absorbed influence columns Z̃_infl are a TRAINING-only leakage
    // absorber and do NOT exist at predict rows (no Stage-1 fold there). The
    // orthogonalized β̂_m is a property of the training fit, so prediction must
    // use ONLY the first p_m entries of block_states[0].beta against this raw
    // marginal_design and DROP the trailing γ. The model-payload / predict
    // builder (src/main.rs run_fit_bernoulli_marginal_slope → inference) owns
    // that truncation; it must record p_m (= marginal_design.design.ncols())
    // and slice the persisted marginal β to it. Survival mirrors this seam.
    Ok(BernoulliMarginalSlopeFitResult {
        fit: solved_fit,
        marginalspec_resolved: resolved_specs.remove(0),
        logslopespec_resolved: resolved_specs.remove(0),
        marginal_design: designs.remove(0),
        logslope_design: designs.remove(0),
        baseline_marginal: baseline.0,
        baseline_logslope: baseline.1,
        z_normalization,
        latent_measure,
        score_warp_runtime,
        link_dev_runtime,
        gaussian_frailty_sd: final_sigma_cell.get(),
        cross_block_warnings,
        latent_z_rank_int_calibration,
    })
}