1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
/// CLI argument definitions using clap derive macros.
use clap::{Args, Parser, Subcommand};
const CLI_VERSION: &str = env!("CARGO_PKG_VERSION");
#[derive(Parser)]
#[command(name = "ldsc", about = "LD Score Regression (Rust port)", version = CLI_VERSION)]
pub struct Cli {
/// Number of Rayon threads (global). Defaults to Rayon’s internal heuristic.
#[arg(long, global = true)]
pub rayon_threads: Option<usize>,
/// Number of Polars threads (global). Defaults to Polars’ internal heuristic.
#[arg(long, global = true)]
pub polars_threads: Option<usize>,
/// Logging level for Rust tracing (error, warn, info, debug, trace).
#[arg(long, default_value = "warn", global = true)]
pub log_level: String,
#[command(subcommand)]
pub command: Command,
}
#[derive(Subcommand)]
pub enum Command {
/// Pre-process GWAS summary statistics (replaces munge_sumstats.py)
MungeSumstats(MungeArgs),
/// Compute LD scores from PLINK binary files (matches Python --l2)
#[command(name = "l2")]
L2(L2Args),
/// Estimate SNP heritability (replaces --h2 in ldsc.py)
H2(H2Args),
/// Estimate genetic correlation (replaces --rg in ldsc.py)
Rg(RgArgs),
/// Generate annotation files from BED regions or gene sets (replaces make_annot.py)
MakeAnnot(MakeAnnotArgs),
/// Generate annotation files by binning continuous variables (Python --cts-bin)
CtsAnnot(CtsAnnotArgs),
}
// ---------------------------------------------------------------------------
// munge-sumstats
// ---------------------------------------------------------------------------
#[derive(Args)]
pub struct MungeArgs {
/// Input summary statistics file (TSV/CSV, optionally .gz or .bz2)
#[arg(long)]
pub sumstats: String,
/// Output file prefix
#[arg(long)]
pub out: String,
/// Optional: merge alleles reference file
#[arg(long)]
pub merge_alleles: Option<String>,
/// Parse Stephan Ripke's daner format (infer N_cas/N_con from FRQ_A_/FRQ_U_ headers).
#[arg(long, default_value_t = false)]
pub daner: bool,
/// Parse newer daner format (uses Nca/Nco columns for N_cas/N_con).
#[arg(long, default_value_t = false)]
pub daner_n: bool,
/// Minimum sample size to retain a SNP (0 → Python default: 90th percentile / 1.5)
#[arg(long, default_value_t = 0.0)]
pub n_min: f64,
/// Minimum MAF to retain a SNP (0 to disable)
#[arg(long = "maf-min", default_value_t = 0.01)]
pub maf: f64,
/// Minimum INFO score to retain a SNP (0 to disable)
#[arg(long, default_value_t = 0.9)]
pub info_min: f64,
/// Remove strand-ambiguous SNPs (A/T, C/G)
#[arg(long, default_value_t = true)]
pub keep_mhc: bool,
// --- Sample size overrides -----------------------------------------------
/// Fix sample size for all SNPs (overrides any N column in the file)
#[arg(long = "N")]
pub n: Option<f64>,
/// Number of cases (N = N-cas + N-con; ignored if --n is set)
#[arg(long = "N-cas")]
pub n_cas: Option<f64>,
/// Number of controls (N = N-cas + N-con; ignored if --n is set)
#[arg(long = "N-con")]
pub n_con: Option<f64>,
// --- Column name overrides (case-insensitive) ----------------------------
// Use when the file uses a non-standard name not in the built-in synonym map.
/// Name of the SNP ID column in the input file
#[arg(long = "snp")]
pub snp_col: Option<String>,
/// Name of the sample size (N) column in the input file
#[arg(long = "N-col")]
pub n_col: Option<String>,
/// Name of the case sample size column (N = N-cas-col + N-con-col per row)
#[arg(long = "N-cas-col")]
pub n_cas_col: Option<String>,
/// Name of the control sample size column (N = N-cas-col + N-con-col per row)
#[arg(long = "N-con-col")]
pub n_con_col: Option<String>,
/// Name of the effect allele (A1) column in the input file
#[arg(long = "a1")]
pub a1_col: Option<String>,
/// Name of the other allele (A2) column in the input file
#[arg(long = "a2")]
pub a2_col: Option<String>,
/// Name of the p-value column in the input file
#[arg(long = "p")]
pub p_col: Option<String>,
/// Name of the allele frequency column in the input file
#[arg(long = "frq")]
pub frq_col: Option<String>,
/// Name of the imputation INFO column in the input file
#[arg(long = "info")]
pub info_col: Option<String>,
/// Signed summary statistic column and its null value: COLNAME,null
/// (e.g., "Z,0" or "OR,1"). Used for P→Z conversion sign when the
/// column name is not in the built-in map.
#[arg(long)]
pub signed_sumstats: Option<String>,
/// Comma-separated list of column names to ignore (drop before any other processing).
/// Useful when a column would otherwise be misidentified as a signed stat.
#[arg(long)]
pub ignore: Option<String>,
/// Keep the allele frequency (MAF) column in the output file.
/// By default only SNP, A1, A2, Z, N are written.
#[arg(long, default_value_t = false)]
pub keep_maf: bool,
/// A1 is always the increasing allele — treat all Z-scores as positive when
/// deriving Z from P-values. Eliminates the need for a signed summary stat.
#[arg(long, default_value_t = false)]
pub a1_inc: bool,
/// Allow summary statistics files without allele columns (A1/A2).
/// Skips strand-ambiguity filtering; output will not include A1/A2.
#[arg(long, default_value_t = false)]
pub no_alleles: bool,
/// Comma-separated list of INFO column names (e.g. "INFO_EUR,INFO_EAS").
/// When provided, the per-SNP INFO filter uses the MEAN of all listed columns.
/// Takes precedence over the single --info-col / synonym-mapped INFO column.
#[arg(long)]
pub info_list: Option<String>,
/// Name of the study-count column (number of studies a SNP was genotyped in).
/// Used together with --nstudy-min to filter low-coverage SNPs.
#[arg(long)]
pub nstudy: Option<String>,
/// Minimum number of studies a SNP must be genotyped in (requires --nstudy).
#[arg(long)]
pub nstudy_min: Option<u64>,
}
// ---------------------------------------------------------------------------
// l2 (--l2)
// ---------------------------------------------------------------------------
#[derive(Args)]
pub struct L2Args {
/// PLINK binary file prefix
#[arg(long)]
pub bfile: String,
/// Output prefix
#[arg(long)]
pub out: String,
/// LD window in cM (default 1.0; mutually exclusive with --ld-wind-kb / --ld-wind-snps)
#[arg(long)]
pub ld_wind_cm: Option<f64>,
/// LD window in kilobases (overrides --ld-wind-cm when set)
#[arg(long)]
pub ld_wind_kb: Option<f64>,
/// LD window as number of flanking SNPs (overrides --ld-wind-cm when set)
#[arg(long = "ld-wind-snps")]
pub ld_wind_snp: Option<usize>,
/// Annotation file prefix for partitioned LD scores.
/// LDSC appends .annot, .annot.gz, or .annot.bz2 to this prefix.
/// The annot file must have the same SNPs in the same order as the .bim file.
#[arg(long)]
pub annot: Option<String>,
/// Compute partitioned LD scores by binning continuous annotations (Python --cts-bin).
/// Provide a single file or a comma-separated list of files.
#[arg(long)]
pub cts_bin: Option<String>,
/// Breakpoints for --cts-bin (comma-separated, use 'x' to separate files).
#[arg(long)]
pub cts_breaks: Option<String>,
/// Names for --cts-bin variables (comma-separated).
#[arg(long)]
pub cts_names: Option<String>,
/// The annot file contains only annotation columns (no CHR/SNP/BP/CM columns).
/// Use when working with "thin" annotation files.
#[arg(long, default_value_t = false)]
pub thin_annot: bool,
/// File containing SNP IDs (one per line) to include in LD score computation.
/// Only listed SNPs are used as targets and in windows; all others are excluded.
#[arg(long)]
pub extract: Option<String>,
/// Exclude SNPs with minor allele frequency below FLOAT from LD computation.
/// Applied after --extract; MAF is computed from the genotype data.
/// Default behavior applies the filter before LD computation (Python parity).
#[arg(long)]
pub maf: Option<f64>,
/// Apply --maf before LD computation (Python behavior). Slower but identical to Python.
/// Default: enabled.
#[arg(long, default_value_t = true)]
pub maf_pre: bool,
/// File containing SNP IDs (one per line) to print LD scores for.
/// Unlike --extract, all SNPs are still used in LD windows; only output is filtered.
#[arg(long)]
pub print_snps: Option<String>,
/// File of individual IDs to restrict the reference panel to (one FID IID per line).
/// Only individuals listed in this file are used for LD computation.
#[arg(long)]
pub keep: Option<String>,
/// Compute per-allele LD scores: weight each r² by p·(1−p) of the source SNP.
/// L2[i] = Σ_j r²_unbiased(i,j) × p_j·(1−p_j).
#[arg(long, default_value_t = false)]
pub per_allele: bool,
/// Generalized per-allele weighting: weight each r² by (p·(1−p))^S.
/// Equivalent to --per-allele when S=1; incompatible with --per-allele.
#[arg(long)]
pub pq_exp: Option<f64>,
/// Do not write the annot matrix produced by --cts-bin.
#[arg(long, default_value_t = false)]
pub no_print_annot: bool,
/// Number of SNPs to process per BLAS chunk (default 200).
/// Larger values change the window approximation slightly.
#[arg(long, default_value_t = 200)]
pub chunk_size: usize,
/// Allow whole-chromosome LD windows without warning.
#[arg(long, default_value_t = false)]
pub yes_really: bool,
}
// ---------------------------------------------------------------------------
// h2 (heritability)
// ---------------------------------------------------------------------------
#[derive(Args)]
pub struct H2Args {
/// Munged summary statistics file (.sumstats[.gz|.bz2])
#[arg(long, required_unless_present = "h2_cts", conflicts_with = "h2_cts")]
pub h2: Option<String>,
/// Cell-type specific analysis input (.sumstats[.gz|.bz2]) (Python --h2-cts).
#[arg(long, required_unless_present = "h2", conflicts_with = "h2")]
pub h2_cts: Option<String>,
/// LD score file prefix, per-chromosome (e.g. eas_ldscores/chr).
/// LDSC appends .l2.ldscore(.gz|.bz2) and the chromosome number.
/// Provide exactly one of --ref-ld-chr or --ref-ld.
#[arg(long)]
pub ref_ld_chr: Option<String>,
/// Single LD score file (non-chr-split alternative to --ref-ld-chr).
/// When using this option, supply --m-snps explicitly (no .l2.M files to read).
#[arg(long)]
pub ref_ld: Option<String>,
/// Regression weight LD score prefix (per-chromosome).
/// Provide exactly one of --w-ld-chr or --w-ld.
#[arg(long)]
pub w_ld_chr: Option<String>,
/// Cell-type-specific LD score list file (.ldcts): label + comma-separated prefixes.
#[arg(long)]
pub ref_ld_chr_cts: Option<String>,
/// Single regression weight LD score file (non-chr-split alternative to --w-ld-chr).
#[arg(long)]
pub w_ld: Option<String>,
/// Output prefix
#[arg(long)]
pub out: String,
/// Number of SNPs in the LD score reference panel (e.g. 1190321 for HapMap3 EUR).
/// Used as M in h2 = slope × M / N. If omitted, reads from .l2.M_5_50 files
/// alongside --ref-ld-chr; falls back to the number of regression SNPs if not found.
#[arg(long = "M")]
pub m_snps: Option<f64>,
/// Use .l2.M instead of .l2.M_5_50 as the M denominator.
/// Python LDSC defaults to .l2.M_5_50; set this flag to replicate --not-M-5-50.
#[arg(long, default_value_t = false)]
pub not_m_5_50: bool,
/// Sample prevalence (for liability-scale conversion; requires --pop-prev)
#[arg(long)]
pub samp_prev: Option<f64>,
/// Population prevalence (for liability-scale conversion; requires --samp-prev)
#[arg(long)]
pub pop_prev: Option<f64>,
/// Constrain the LD score regression intercept to 1 (for h2) rather than estimating it.
/// Equivalent to Python's --no-intercept.
#[arg(long, default_value_t = false)]
pub no_intercept: bool,
/// Fix the LD score regression intercept to a specific value (e.g. 1.02).
/// Takes precedence over --no-intercept when both are given.
#[arg(long)]
pub intercept_h2: Option<f64>,
/// Two-step estimator: use SNPs with chi² ≤ CHI2_MAX to estimate the intercept
/// in step 1, then fix the intercept and estimate h2 on all SNPs in step 2.
/// Recommended value: 30.
#[arg(long)]
pub two_step: Option<f64>,
/// Remove SNPs with chi² > CHISQ_MAX before regression (default: no filter).
#[arg(long)]
pub chisq_max: Option<f64>,
/// Number of jackknife blocks for SE estimation (default: 200)
#[arg(long, default_value_t = 200)]
pub n_blocks: usize,
/// Print per-annotation τ coefficients and enrichment alongside total h2.
/// Only meaningful when --ref-ld-chr points to partitioned (multi-column) LD score files.
#[arg(long, default_value_t = false)]
pub print_coefficients: bool,
/// For --h2-cts: report coefficients for all LD score sets in each line.
#[arg(long, default_value_t = false)]
pub print_all_cts: bool,
/// Treat partitioned annotations as overlapping categories (Python LDSC --overlap-annot).
/// Requires --frqfile/--frqfile-chr unless --not-m-5-50 is set.
#[arg(long, default_value_t = false)]
pub overlap_annot: bool,
/// Allele frequency file for overlapping annotations (single fileset).
/// Only used when --overlap-annot is set and --not-m-5-50 is false.
#[arg(long)]
pub frqfile: Option<String>,
/// Allele frequency files split per chromosome (prefix).
/// Only used when --overlap-annot is set and --not-m-5-50 is false.
#[arg(long)]
pub frqfile_chr: Option<String>,
/// Print jackknife covariance matrix of regression estimates.
#[arg(long, default_value_t = false)]
pub print_cov: bool,
/// Print per-block jackknife delete values for regression estimates.
#[arg(long, default_value_t = false)]
pub print_delete_vals: bool,
/// Allow h2 < 0 or |rg| > 1 in output (Rust never clips; accepted for CLI parity).
#[arg(long, default_value_t = false)]
pub return_silly_things: bool,
/// Force matrix inversion even when ill-conditioned (Rust uses least-squares solver;
/// accepted for CLI parity).
#[arg(long, default_value_t = false)]
pub invert_anyway: bool,
}
// ---------------------------------------------------------------------------
// rg (genetic correlation)
// ---------------------------------------------------------------------------
#[derive(Args)]
pub struct RgArgs {
/// Comma-separated list of .sumstats(.gz|.bz2) files
#[arg(long, value_delimiter = ',')]
pub rg: Vec<String>,
/// LD score file prefix (per-chromosome).
/// Provide exactly one of --ref-ld-chr or --ref-ld.
#[arg(long)]
pub ref_ld_chr: Option<String>,
/// Single LD score file (non-chr-split alternative to --ref-ld-chr).
#[arg(long)]
pub ref_ld: Option<String>,
/// Regression weight LD score prefix (per-chromosome).
/// Provide exactly one of --w-ld-chr or --w-ld.
#[arg(long)]
pub w_ld_chr: Option<String>,
/// Single regression weight LD score file (non-chr-split alternative to --w-ld-chr).
#[arg(long)]
pub w_ld: Option<String>,
/// Output prefix
#[arg(long)]
pub out: String,
/// Number of SNPs in the LD score reference panel.
/// If omitted, reads from .l2.M_5_50 files alongside --ref-ld-chr.
#[arg(long = "M")]
pub m_snps: Option<f64>,
/// Use .l2.M instead of .l2.M_5_50 as the M denominator.
#[arg(long, default_value_t = false)]
pub not_m_5_50: bool,
/// Constrain the genetic covariance intercept to 0 rather than estimating it.
#[arg(long, default_value_t = false)]
pub no_intercept: bool,
/// Two-step estimator for gencov: estimate intercept on SNPs with |Z1·Z2| ≤ CHI2_MAX,
/// then fix it and re-estimate slope on all SNPs.
#[arg(long)]
pub two_step: Option<f64>,
/// Remove SNPs with chi²₁·chi²₂ products above CHISQ_MAX before regression.
#[arg(long)]
pub chisq_max: Option<f64>,
/// Number of jackknife blocks for SE estimation (default: 200)
#[arg(long, default_value_t = 200)]
pub n_blocks: usize,
/// Sample prevalence for each trait (comma-separated), for liability-scale conversion.
/// Requires --pop-prev. Use one value per file in --rg.
#[arg(long, value_delimiter = ',')]
pub samp_prev: Vec<f64>,
/// Population prevalence for each trait (comma-separated).
/// Requires --samp-prev. Use one value per file in --rg.
#[arg(long, value_delimiter = ',')]
pub pop_prev: Vec<f64>,
/// Fix the gencov intercept to a specific value per trait-pair (comma-separated).
/// One value per pair in order: for traits A,B,C the pairs are A-B, B-C.
/// If fewer values than pairs are given, remaining pairs are freely estimated.
#[arg(long, value_delimiter = ',')]
pub intercept_gencov: Vec<f64>,
/// Fix per-trait h2 intercepts (comma-separated, one per trait in --rg order).
#[arg(long, value_delimiter = ',')]
pub intercept_h2: Vec<f64>,
/// Skip allele consistency checking between summary statistic files.
/// When set, skips allele alignment and mismatch filtering.
#[arg(long, default_value_t = false)]
pub no_check_alleles: bool,
/// Print jackknife covariance matrix of regression estimates.
#[arg(long, default_value_t = false)]
pub print_cov: bool,
/// Print per-block jackknife delete values for regression estimates.
#[arg(long, default_value_t = false)]
pub print_delete_vals: bool,
/// Allow |rg| > 1 or negative h2 in output (Rust never clips; accepted for CLI parity).
#[arg(long, default_value_t = false)]
pub return_silly_things: bool,
/// Force matrix inversion even when ill-conditioned (accepted for CLI parity).
#[arg(long, default_value_t = false)]
pub invert_anyway: bool,
}
// ---------------------------------------------------------------------------
// make-annot
// ---------------------------------------------------------------------------
#[derive(Args)]
pub struct MakeAnnotArgs {
/// PLINK .bim file listing the SNPs to annotate
#[arg(long)]
pub bimfile: String,
/// Output annotation file path (e.g., prefix.annot or prefix.annot.gz).
/// If the path ends in .gz, the output is gzip-compressed.
#[arg(long)]
pub annot_file: String,
/// UCSC BED file defining annotation regions (mutually exclusive with --gene-set-file).
/// Columns: chrom start(0-based) end(exclusive) [name …]
#[arg(long)]
pub bed_file: Option<String>,
/// Gene set file: one gene symbol per line.
/// Requires --gene-coord-file. SNPs within windowsize bp of each gene are annotated.
#[arg(long)]
pub gene_set_file: Option<String>,
/// Gene coordinate file: tab-separated with columns GENE CHR START END.
/// Requires --gene-set-file.
#[arg(long)]
pub gene_coord_file: Option<String>,
/// Window size in base pairs to add around each region (symmetric extension).
#[arg(long, default_value_t = 0)]
pub windowsize: u32,
/// Do not merge overlapping BED intervals before annotation.
/// By default adjacent/overlapping intervals are merged.
#[arg(long, default_value_t = false)]
pub nomerge: bool,
}
// ---------------------------------------------------------------------------
// cts-annot (continuous binning)
// ---------------------------------------------------------------------------
#[derive(Args)]
pub struct CtsAnnotArgs {
/// PLINK .bim file listing the SNPs to annotate
#[arg(long)]
pub bimfile: String,
/// Comma-separated list of CTS files (SNP + value; no header)
#[arg(long)]
pub cts_bin: String,
/// Breakpoints for each CTS file: comma-separated per file, joined by 'x'.
/// Use N to denote negative values (Python compatibility).
#[arg(long)]
pub cts_breaks: String,
/// Optional names for each CTS variable (comma-separated).
#[arg(long)]
pub cts_names: Option<String>,
/// Output annotation file path (e.g., prefix.annot or prefix.annot.gz).
#[arg(long)]
pub annot_file: String,
}