use std::collections::HashMap;
use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use clap::{Args, Parser, Subcommand};
use faer::Mat;
use gsem::io::gwas_reader;
use gsem::munge;
#[derive(Parser)]
#[command(
name = "gsem",
about = "Genomic Structural Equation Modeling",
long_about = "\
Genomic Structural Equation Modeling from GWAS summary statistics.
Pipeline: `munge` → `ldsc` (or `hdl`) → `commonfactor` / `usermodel` / `userGWAS` / ...
Use `gsem <subcommand> --help` to see a worked example for each subcommand
(use long `--help`, not short `-h`, to get the extended help with examples)."
)]
struct Cli {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
#[command(long_about = "\
QC and munge raw GWAS summary statistics against an HM3 SNPlist.
Writes one `<trait>.sumstats.gz` per input file, harmonising allele and
effect columns. Output feeds `gsem ldsc` and `gsem sumstats`.
EXAMPLE:
gsem munge \\
--files raw_T1.txt.gz raw_T2.txt.gz \\
--hm3 w_hm3.snplist \\
--trait-names T1 T2 \\
--N 1e5 1e5")]
Munge(MungeArgs),
#[command(long_about = "\
Estimate a genetic covariance matrix (and sampling-covariance matrix)
across munged GWAS summary statistics via LD score regression with a
block jackknife. Writes `ldsc.json` consumable by downstream subcommands
(`commonfactor`, `usermodel`, `userGWAS`, `commonfactorGWAS`, ...).
EXAMPLE:
gsem ldsc \\
--traits T1.sumstats.gz T2.sumstats.gz T3.sumstats.gz \\
--ld eur_w_ld_chr/ \\
--out ldsc.json")]
Ldsc(LdscArgs),
#[command(
name = "usermodel",
long_about = "\
Fit a user-specified SEM to the genetic covariance structure from `gsem
ldsc`. Writes parameter estimates and fit indices to a TSV.
EXAMPLE:
gsem ldsc --traits T1.sumstats.gz T2.sumstats.gz T3.sumstats.gz \\
--ld eur_w_ld_chr/ --out ldsc.json
gsem usermodel \\
--covstruc ldsc.json \\
--model 'F1 =~ NA*V1 + V2 + V3\\nF1 ~~ 1*F1' \\
--out fit.tsv"
)]
Sem(SemArgs),
#[command(long_about = "\
Merge per-trait sumstats into a single SNP x trait TSV for `userGWAS` /
`commonfactorGWAS`. Aligns alleles to a reference panel and applies QC
filters.
EXAMPLE:
gsem sumstats \\
--files T1.sumstats.gz T2.sumstats.gz \\
--ref eur_w_ld_chr/ \\
--trait-names T1 T2 \\
--out merged_sumstats.tsv")]
Sumstats(SumstatsArgs),
#[command(
name = "commonfactor",
long_about = "\
Auto-generate and fit a 1-factor CFA model to the genetic covariance
structure from `gsem ldsc`. Writes parameter estimates and fit indices
(chisq, df, p_chisq, AIC, CFI, SRMR) to a TSV.
EXAMPLE:
gsem ldsc --traits T1.sumstats.gz T2.sumstats.gz T3.sumstats.gz \\
--ld eur_w_ld_chr/ --out ldsc.json
gsem commonfactor --covstruc ldsc.json --out cf.tsv"
)]
CommonFactor(CommonFactorArgs),
#[command(
name = "userGWAS",
long_about = "\
Fit a user-specified SEM at every SNP in a merged sumstats file. Writes
two TSVs: a per-SNP table (metadata + fit stats) and a flat parameter
table (join on SNP).
EXAMPLE:
gsem sumstats --files T1.sumstats.gz T2.sumstats.gz T3.sumstats.gz \\
--ref eur_w_ld_chr/ --out merged_sumstats.tsv
gsem userGWAS \\
--covstruc ldsc.json \\
--snps merged_sumstats.tsv \\
--model 'F1 =~ NA*V1 + V2 + V3\\nF1 ~ SNP\\nF1 ~~ 1*F1' \\
--out-snps snps.tsv --out-params params.tsv"
)]
Gwas(GwasArgs),
#[command(
name = "commonfactorGWAS",
long_about = "\
Auto-generate a 1-factor model with SNP → F1 and fit it at every SNP.
WARNING: this subcommand uses an auto-generated fixed-variance
parameterisation that differs from R's `GenomicSEM::commonfactorGWAS`,
which uses a marker-indicator approach. Results are not bit-identical.
Set GSEMR_COMMONFACTOR_GWAS_QUIET=1 to suppress the first-use warning.
EXAMPLE:
gsem commonfactorGWAS \\
--covstruc ldsc.json \\
--snps merged_sumstats.tsv \\
--out-snps snps.tsv --out-params params.tsv"
)]
CommonfactorGwas(CommonfactorGwasArgs),
#[command(
name = "write.model",
long_about = "\
Convert a factor loading matrix (CSV/TSV) into lavaan-style model syntax.
Loadings below `--cutoff` are dropped; residual variances can be fixed
positive with `--fix-resid`.
EXAMPLE:
gsem write.model --loadings loadings.csv --cutoff 0.3"
)]
WriteModel(WriteModelArgs),
#[command(
name = "paLDSC",
long_about = "\
Compare observed eigenvalues of the genetic covariance matrix to the
95th percentile of eigenvalues simulated from sampling covariances, and
report the recommended number of non-spurious factors.
EXAMPLE:
gsem paLDSC --covstruc ldsc.json --r 500 --out pa.tsv"
)]
ParallelAnalysis(ParallelAnalysisArgs),
#[command(long_about = "\
Enrichment analysis stacking `gsem s_ldsc` output with a SEM model.
Returns per-annotation enrichment and SE.
EXAMPLE:
gsem s_ldsc --traits T1.sumstats.gz --ld baselineLD_v2.2/ \\
--wld weights_hm3_no_hla/ --frq 1000G_Phase3_frq/ \\
--out sldsc.json
gsem enrich --sldsc sldsc.json --model 'F1 =~ NA*V1' --out enr.tsv")]
Enrich(EnrichArgs),
#[command(
name = "s_ldsc",
long_about = "\
Partition heritability across functional annotations. Writes the per-
annotation S / V matrices and the `tau` table to a JSON bundle
consumable by `gsem enrich`.
EXAMPLE:
gsem s_ldsc \\
--traits T1.sumstats.gz \\
--ld baselineLD_v2.2/ \\
--wld weights_hm3_no_hla/ \\
--frq 1000G_Phase3_frq/ \\
--out sldsc.json"
)]
SLdsc(SLdscArgs),
#[command(long_about = "\
Fit a SEM and return the model-implied genetic correlation matrix plus
its sampling covariance. DWLS is the default estimator; pass `--ml` to
use ML instead.
EXAMPLE:
gsem rgmodel --covstruc ldsc.json --out rg.tsv")]
Rgmodel(RgmodelArgs),
#[command(
name = "multiSNP",
long_about = "\
Fit a SEM that regresses a common factor on multiple SNPs
simultaneously, accounting for LD between them.
EXAMPLE:
gsem multiSNP \\
--covstruc ldsc.json \\
--snps merged_sumstats.tsv \\
--snp-list rs1 rs2 rs3 \\
--model 'F1 =~ NA*V1 + V2\\nF1 ~ rs1 + rs2 + rs3\\nF1 ~~ 1*F1' \\
--out multi.tsv"
)]
MultiSnp(MultiSnpArgs),
#[command(
name = "simLDSC",
long_about = "\
Simulate multivariate GWAS summary statistics given a target genetic
covariance matrix. Writes `iter<r>GWAS<i>.sumstats.gz` for each
simulated trait (matching R GenomicSEM::simLDSC's file layout).
EXAMPLE:
gsem simLDSC \\
--covmat covmat.tsv \\
--N 1e5 1e5 \\
--ld eur_w_ld_chr/"
)]
Simulate(SimulateArgs),
#[command(long_about = "\
Run HDL (High-Definition Likelihood) estimation. Output shape is
identical to `gsem ldsc`, so downstream subcommands consume it the same
way.
EXAMPLE:
gsem hdl \\
--traits T1.sumstats.gz T2.sumstats.gz \\
--ld UKB_imputed_SVD_eigen99_extraction/ \\
--n-ref 335265 \\
--out hdl.json")]
Hdl(HdlArgs),
#[command(
name = "summaryGLS",
long_about = "\
GLS regression of a response vector on a predictor matrix given a
response covariance.
EXAMPLE:
gsem summaryGLS \\
--y y.tsv --v-y vy.tsv --predictors x.tsv --intercept \\
--out gls.tsv"
)]
SummaryGls(SummaryGlsArgs),
}
#[derive(Args)]
struct MungeArgs {
#[arg(short, long, num_args = 1..)]
files: Vec<PathBuf>,
#[arg(long)]
hm3: PathBuf,
#[arg(long, num_args = 1..)]
trait_names: Option<Vec<String>>,
#[arg(long, default_value = "0.9")]
info_filter: f64,
#[arg(long, default_value = "0.01")]
maf_filter: f64,
#[arg(short, long)]
n: Option<f64>,
#[arg(long)]
column_names: Option<String>,
#[arg(short, long, default_value = ".")]
out: PathBuf,
}
#[derive(Args)]
struct LdscArgs {
#[arg(short, long, num_args = 1..)]
traits: Vec<PathBuf>,
#[arg(long)]
sample_prev: Option<String>,
#[arg(long)]
pop_prev: Option<String>,
#[arg(long)]
ld: PathBuf,
#[arg(long)]
wld: Option<PathBuf>,
#[arg(long, default_value = "200")]
n_blocks: usize,
#[arg(long)]
chisq_max: Option<f64>,
#[arg(long, default_value = "22")]
chr: usize,
#[arg(long, default_value = "all")]
select: String,
#[arg(long)]
stand: bool,
#[arg(short, long, default_value = "ldsc_result.json")]
out: PathBuf,
}
#[derive(Args)]
struct SemArgs {
#[arg(long)]
covstruc: PathBuf,
#[arg(long)]
model: Option<String>,
#[arg(long)]
model_file: Option<PathBuf>,
#[arg(long, default_value = "DWLS")]
estimation: String,
#[arg(long)]
std_lv: bool,
#[arg(long)]
fix_resid: bool,
#[arg(long)]
q_factor: bool,
#[arg(short, long, default_value = "sem_result.tsv")]
out: PathBuf,
}
#[derive(Args)]
struct SumstatsArgs {
#[arg(short, long, num_args = 1..)]
files: Vec<PathBuf>,
#[arg(long, name = "ref")]
ref_file: PathBuf,
#[arg(long, num_args = 1..)]
trait_names: Option<Vec<String>>,
#[arg(long, default_value = "0.6")]
info_filter: f64,
#[arg(long, default_value = "0.01")]
maf_filter: f64,
#[arg(long)]
keep_indel: bool,
#[arg(short, long, default_value = "merged_sumstats.tsv")]
out: PathBuf,
#[arg(long)]
threads: Option<usize>,
}
#[derive(Args)]
struct CommonFactorArgs {
#[arg(long)]
covstruc: PathBuf,
#[arg(long, default_value = "DWLS")]
estimation: String,
#[arg(short, long, default_value = "commonfactor_result.tsv")]
out: PathBuf,
}
#[derive(Args)]
struct GwasArgs {
#[arg(long)]
covstruc: PathBuf,
#[arg(long)]
sumstats: PathBuf,
#[arg(long)]
model: Option<String>,
#[arg(long)]
model_file: Option<PathBuf>,
#[arg(long, default_value = "DWLS")]
estimation: String,
#[arg(long, default_value = "standard")]
gc: String,
#[arg(long)]
threads: Option<usize>,
#[arg(long)]
std_lv: bool,
#[arg(long)]
sub: Option<String>,
#[arg(long)]
smooth_check: bool,
#[arg(long)]
q_snp: bool,
#[arg(long)]
fix_measurement: bool,
#[arg(long)]
twas: bool,
#[arg(short, long, default_value = "gwas_result.tsv")]
out: PathBuf,
#[arg(long)]
save_plot_manhattan: Option<PathBuf>,
#[arg(long)]
save_plot_qq: Option<PathBuf>,
}
#[derive(Args)]
struct CommonfactorGwasArgs {
#[arg(long)]
covstruc: PathBuf,
#[arg(long)]
sumstats: PathBuf,
#[arg(long, default_value = "standard")]
gc: String,
#[arg(long)]
threads: Option<usize>,
#[arg(short, long, default_value = "commonfactor_gwas_result.tsv")]
out: PathBuf,
#[arg(long)]
save_plot_manhattan: Option<PathBuf>,
#[arg(long)]
save_plot_qq: Option<PathBuf>,
}
#[derive(Args)]
struct WriteModelArgs {
#[arg(long)]
loadings: PathBuf,
#[arg(long, num_args = 1..)]
names: Vec<String>,
#[arg(long, default_value = "0.3")]
cutoff: f64,
#[arg(long)]
fix_resid: bool,
#[arg(long)]
bifactor: bool,
#[arg(short, long, default_value = "model.txt")]
out: PathBuf,
}
#[derive(Args)]
struct ParallelAnalysisArgs {
#[arg(long)]
covstruc: PathBuf,
#[arg(long, default_value = "500")]
n_sim: usize,
#[arg(short, long, default_value = "pa_result.tsv")]
out: PathBuf,
#[arg(long)]
save_plot: Option<PathBuf>,
}
#[derive(Args)]
struct EnrichArgs {
#[arg(long)]
input: PathBuf,
#[arg(short, long, default_value = "enrich_result.tsv")]
out: PathBuf,
}
#[derive(Args)]
struct SLdscArgs {
#[arg(short, long, num_args = 1..)]
traits: Vec<PathBuf>,
#[arg(long)]
sample_prev: Option<String>,
#[arg(long)]
pop_prev: Option<String>,
#[arg(long)]
ld: PathBuf,
#[arg(long)]
wld: Option<PathBuf>,
#[arg(long, default_value = "200")]
n_blocks: usize,
#[arg(long, default_value = "22")]
chr: usize,
#[arg(long, default_value = "all")]
select: String,
#[arg(short, long, default_value = "s_ldsc_result.json")]
out: PathBuf,
}
#[derive(Args)]
struct RgmodelArgs {
#[arg(long)]
covstruc: PathBuf,
#[arg(long, default_value = "DWLS")]
estimation: String,
#[arg(long)]
model: Option<String>,
#[arg(long, default_value_t = false)]
std_lv: bool,
#[arg(short, long, default_value = "rgmodel_result.tsv")]
out: PathBuf,
}
#[derive(Args)]
struct MultiSnpArgs {
#[arg(long)]
covstruc: PathBuf,
#[arg(long)]
sumstats: PathBuf,
#[arg(long)]
ld_matrix: PathBuf,
#[arg(long)]
model: Option<String>,
#[arg(long)]
model_file: Option<PathBuf>,
#[arg(long, default_value = "DWLS")]
estimation: String,
#[arg(short, long, default_value = "multi_snp_result.tsv")]
out: PathBuf,
}
#[derive(Args)]
struct SimulateArgs {
#[arg(long)]
covstruc: PathBuf,
#[arg(long)]
n_per_trait: String,
#[arg(long)]
ld: PathBuf,
#[arg(short, long, default_value = "simulated_sumstats.tsv")]
out: PathBuf,
}
#[derive(Args)]
struct HdlArgs {
#[arg(short, long, num_args = 1..)]
traits: Vec<PathBuf>,
#[arg(long)]
sample_prev: Option<String>,
#[arg(long)]
pop_prev: Option<String>,
#[arg(long)]
ld_path: PathBuf,
#[arg(long, default_value = "335265")]
n_ref: f64,
#[arg(long, default_value = "piecewise")]
method: String,
#[arg(short, long, default_value = "hdl_result.json")]
out: PathBuf,
}
#[derive(Args)]
struct SummaryGlsArgs {
#[arg(long)]
x: PathBuf,
#[arg(long)]
y: PathBuf,
#[arg(long)]
v: PathBuf,
#[arg(long, default_value = "true")]
intercept: bool,
#[arg(short, long, default_value = "gls_result.tsv")]
out: PathBuf,
}
fn main() -> Result<()> {
env_logger::init();
let cli = Cli::parse();
match cli.command {
Commands::Munge(args) => run_munge(args),
Commands::Ldsc(args) => run_ldsc(args),
Commands::Sem(args) => run_sem(args),
Commands::Sumstats(args) => run_sumstats(args),
Commands::CommonFactor(args) => run_commonfactor_cmd(args),
Commands::Gwas(args) => run_gwas(args),
Commands::CommonfactorGwas(args) => run_commonfactor_gwas(args),
Commands::WriteModel(args) => run_write_model(args),
Commands::ParallelAnalysis(args) => run_parallel_analysis(args),
Commands::SLdsc(args) => run_s_ldsc(args),
Commands::Enrich(args) => run_enrich(args),
Commands::Rgmodel(args) => run_rgmodel_cmd(args),
Commands::MultiSnp(args) => run_multi_snp_cmd(args),
Commands::Simulate(args) => run_simulate(args),
Commands::Hdl(args) => run_hdl(args),
Commands::SummaryGls(args) => run_summary_gls(args),
}
}
fn run_munge(args: MungeArgs) -> Result<()> {
eprintln!("Reading reference panel: {}", args.hm3.display());
let reference = munge::read_reference(&args.hm3).context("failed to read HapMap3 reference")?;
eprintln!("Loaded {} reference SNPs", reference.len());
let column_overrides = args.column_names.map(|s| {
s.split(',')
.filter_map(|pair| {
let mut parts = pair.splitn(2, '=');
let key = parts.next()?.trim().to_string();
let value = parts.next()?.trim().to_string();
if key.is_empty() || value.is_empty() {
None
} else {
Some((key, value))
}
})
.collect::<HashMap<String, String>>()
});
let config = munge::MungeConfig {
info_filter: args.info_filter,
maf_filter: args.maf_filter,
n_override: args.n,
column_overrides,
};
let trait_names = args.trait_names.as_deref();
for (i, file) in args.files.iter().enumerate() {
let trait_name = trait_names
.and_then(|names| names.get(i))
.map(|s| s.as_str())
.or_else(|| file.file_stem().and_then(|s| s.to_str()))
.unwrap_or("trait");
let out_path = args.out.join(format!("{trait_name}.sumstats.gz"));
eprintln!("Munging: {} -> {}", file.display(), out_path.display());
munge::munge_and_write(file, &reference, &config, &out_path)
.with_context(|| format!("failed to munge {}", file.display()))?;
}
eprintln!("Done.");
Ok(())
}
fn run_sumstats(args: SumstatsArgs) -> Result<()> {
let names: Vec<String> = args.trait_names.unwrap_or_else(|| {
args.files
.iter()
.map(|f| {
f.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("trait")
.to_string()
})
.collect()
});
let k = args.files.len();
let config = gsem::sumstats::SumstatsConfig {
info_filter: args.info_filter,
maf_filter: args.maf_filter,
n_overrides: vec![None; k],
se_logit: vec![false; k],
ols: vec![false; k],
linprob: vec![false; k],
keep_indel: args.keep_indel,
keep_ambig: false,
beta_overrides: Vec::new(),
direct_filter: false,
num_threads: args.threads,
};
let file_refs: Vec<&Path> = args.files.iter().map(|p| p.as_path()).collect();
eprintln!("Merging {} GWAS files...", k);
gsem::sumstats::merge_sumstats(&file_refs, &args.ref_file, &names, &config, &args.out)?;
eprintln!("Done.");
Ok(())
}
fn run_ldsc(args: LdscArgs) -> Result<()> {
eprintln!("Reading {} trait files...", args.traits.len());
let mut trait_data = Vec::new();
for path in &args.traits {
let records = gwas_reader::read_sumstats(path)
.with_context(|| format!("failed to read {}", path.display()))?;
let n_snps = records.len();
eprintln!(" {}: {} SNPs", path.display(), n_snps);
trait_data.push(gsem_ldsc::TraitSumstats {
snp: records.iter().map(|r| r.snp.clone()).collect(),
z: records.iter().map(|r| r.z).collect(),
n: records.iter().map(|r| r.n).collect(),
a1: records.iter().map(|r| r.a1.clone()).collect(),
a2: records.iter().map(|r| r.a2.clone()).collect(),
});
}
let wld_dir = args.wld.as_deref().unwrap_or(&args.ld);
let chromosomes = parse_chromosome_selection(&args.select, args.chr);
let ld_data = gsem::io::ld_reader::read_ld_scores(&args.ld, wld_dir, &chromosomes)
.context("failed to read LD scores")?;
eprintln!(
"Loaded {} LD score SNPs, M={}",
ld_data.records.len(),
ld_data.total_m
);
let k = args.traits.len();
let sp = parse_prevalences(&args.sample_prev, k);
let pp = parse_prevalences(&args.pop_prev, k);
let ld_snps: Vec<String> = ld_data.records.iter().map(|r| r.snp.clone()).collect();
let ld_scores: Vec<f64> = ld_data.records.iter().map(|r| r.l2).collect();
let config = gsem_ldsc::LdscConfig {
n_blocks: args.n_blocks,
chisq_max: args.chisq_max,
num_threads: None,
};
let n_pairs = k * (k + 1) / 2;
let pb = indicatif::ProgressBar::new(n_pairs as u64);
pb.set_style(
indicatif::ProgressStyle::with_template(
"[{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} trait pairs ({eta})",
)
.unwrap(),
);
let result = gsem_ldsc::ldsc(
&trait_data,
&sp,
&pp,
&ld_scores,
&ld_data.w_ld,
&ld_snps,
ld_data.total_m,
&config,
Some(&|| pb.inc(1)),
)?;
pb.finish_with_message("complete");
let json = result.to_json_string()?;
std::fs::write(&args.out, &json)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("LDSC complete. Results written to {}", args.out.display());
let k = result.s.nrows();
eprintln!("\nGenetic covariance matrix (S):");
for i in 0..k {
let row: Vec<String> = (0..k)
.map(|j| format!("{:8.4}", result.s[(i, j)]))
.collect();
eprintln!(" {}", row.join(" "));
}
eprintln!("\nIntercepts (I):");
for i in 0..k {
let row: Vec<String> = (0..k)
.map(|j| format!("{:8.4}", result.i_mat[(i, j)]))
.collect();
eprintln!(" {}", row.join(" "));
}
if args.stand {
eprintln!("\nStandardized genetic correlation matrix (S_Stand):");
for i in 0..k {
let row: Vec<String> = (0..k)
.map(|j| {
let d_i = result.s[(i, i)].sqrt();
let d_j = result.s[(j, j)].sqrt();
if d_i > 0.0 && d_j > 0.0 {
format!("{:8.4}", result.s[(i, j)] / (d_i * d_j))
} else {
format!("{:8.4}", 0.0)
}
})
.collect();
eprintln!(" {}", row.join(" "));
}
}
Ok(())
}
fn run_s_ldsc(args: SLdscArgs) -> Result<()> {
eprintln!("Reading {} trait files...", args.traits.len());
let mut trait_data = Vec::new();
for path in &args.traits {
let records = gwas_reader::read_sumstats(path)
.with_context(|| format!("failed to read {}", path.display()))?;
let n_snps = records.len();
eprintln!(" {}: {} SNPs", path.display(), n_snps);
trait_data.push(gsem_ldsc::TraitSumstats {
snp: records.iter().map(|r| r.snp.clone()).collect(),
z: records.iter().map(|r| r.z).collect(),
n: records.iter().map(|r| r.n).collect(),
a1: records.iter().map(|r| r.a1.clone()).collect(),
a2: records.iter().map(|r| r.a2.clone()).collect(),
});
}
let wld_dir = args.wld.as_deref().unwrap_or(&args.ld);
let chromosomes = parse_chromosome_selection(&args.select, args.chr);
let annot_data = gsem_ldsc::annot_reader::read_annot_ld_scores(&args.ld, wld_dir, &chromosomes)
.context("failed to read annotation LD scores")?;
eprintln!(
"Loaded {} LD score SNPs, {} annotations, M_total={}",
annot_data.snps.len(),
annot_data.annotation_names.len(),
annot_data.m_annot.iter().sum::<f64>()
);
let k = args.traits.len();
let sp = parse_prevalences(&args.sample_prev, k);
let pp = parse_prevalences(&args.pop_prev, k);
let config = gsem_ldsc::stratified::StratifiedLdscConfig {
n_blocks: args.n_blocks,
rm_flank: false,
flank_kb: 500,
};
eprintln!("Running stratified LDSC...");
let result = gsem_ldsc::stratified::s_ldsc(
&trait_data,
&sp,
&pp,
&annot_data.annot_ld,
&annot_data.w_ld,
&annot_data.snps,
&annot_data.annotation_names,
&annot_data.m_annot,
&config,
Some(&annot_data.chr),
Some(&annot_data.bp),
)?;
let json = result.to_json_string()?;
std::fs::write(&args.out, &json)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!(
"Stratified LDSC complete. Results written to {}",
args.out.display()
);
let n_annot = result.annotations.len();
eprintln!("\nAnnotations ({n_annot}):");
for (a, name) in result.annotations.iter().enumerate() {
let h2_diag: Vec<String> = (0..result.s_annot[a].nrows())
.map(|i| format!("{:.6}", result.s_annot[a][(i, i)]))
.collect();
eprintln!(
" {name}: M={:.0}, prop={:.4}, h2_diag=[{}]",
result.m_annot[a],
result.prop[a],
h2_diag.join(", ")
);
}
eprintln!("\nIntercepts (I):");
let ki = result.i_mat.nrows();
for i in 0..ki {
let row: Vec<String> = (0..ki)
.map(|j| format!("{:8.4}", result.i_mat[(i, j)]))
.collect();
eprintln!(" {}", row.join(" "));
}
Ok(())
}
fn run_sem(args: SemArgs) -> Result<()> {
let json = std::fs::read_to_string(&args.covstruc)
.with_context(|| format!("failed to read {}", args.covstruc.display()))?;
let ldsc_result = gsem_ldsc::LdscResult::from_json_string(&json)?;
let model_str = if let Some(m) = args.model {
m
} else if let Some(f) = args.model_file {
std::fs::read_to_string(&f).with_context(|| format!("failed to read {}", f.display()))?
} else {
anyhow::bail!("must provide --model or --model-file");
};
let estimation = gsem_sem::EstimationMethod::from_str_lossy(&args.estimation);
let std_lv = args.std_lv;
eprintln!("Fitting SEM model (estimation={estimation}, std_lv={std_lv})...");
let mut pt =
gsem_sem::syntax::parse_model(&model_str, std_lv).map_err(|e| anyhow::anyhow!("{e}"))?;
let k = ldsc_result.s.nrows();
let obs_names: Vec<String> = (0..k).map(|i| format!("V{}", i + 1)).collect();
let mut sem_model = gsem_sem::model::Model::from_partable(&pt, &obs_names);
let kstar = k * (k + 1) / 2;
let v_diag: Vec<f64> = (0..kstar).map(|i| ldsc_result.v[(i, i)]).collect();
let fit = match estimation {
gsem_sem::EstimationMethod::Ml => {
gsem_sem::estimator::fit_ml(&mut sem_model, &ldsc_result.s, 1000, None)
}
gsem_sem::EstimationMethod::Dwls => {
gsem_sem::estimator::fit_dwls(&mut sem_model, &ldsc_result.s, &v_diag, 1000, None)
}
};
let fit = if !fit.converged && args.fix_resid {
eprintln!(
"Model did not converge; retrying with residual variance lower bounds (fix_resid)..."
);
for row in &mut pt.rows {
if row.op == gsem_sem::syntax::Op::Covariance
&& row.lhs == row.rhs
&& row.free > 0
&& row.lower_bound.is_none()
{
row.lower_bound = Some(0.0001);
}
}
sem_model = gsem_sem::model::Model::from_partable(&pt, &obs_names);
match estimation {
gsem_sem::EstimationMethod::Ml => {
gsem_sem::estimator::fit_ml(&mut sem_model, &ldsc_result.s, 1000, None)
}
gsem_sem::EstimationMethod::Dwls => {
gsem_sem::estimator::fit_dwls(&mut sem_model, &ldsc_result.s, &v_diag, 1000, None)
}
}
} else {
fit
};
eprintln!("Converged: {}", fit.converged);
eprintln!("Objective: {:.6}", fit.objective);
let mut output = String::from("lhs\top\trhs\test\n");
let free_rows: Vec<_> = pt.rows.iter().filter(|r| r.free > 0).collect();
for (i, row) in free_rows.iter().enumerate() {
let est = fit.params.get(i).copied().unwrap_or(0.0);
output.push_str(&format!(
"{}\t{}\t{}\t{:.6}\n",
row.lhs, row.op, row.rhs, est
));
}
if args.q_factor {
let sigma_hat = sem_model.implied_cov();
let fi = gsem_sem::q_factor::factor_indicators(&pt, &obs_names);
if fi.len() >= 2 {
let q_results = gsem_sem::q_factor::compute_q_factor(
&ldsc_result.s,
&sigma_hat,
&ldsc_result.v,
&fi,
)?;
if !q_results.is_empty() {
output.push_str("\n# Q_Factor heterogeneity test\n");
output.push_str("factor1\tfactor2\tQ_chisq\tQ_df\tQ_pval\n");
for qr in &q_results {
output.push_str(&format!(
"{}\t{}\t{:.6}\t{}\t{:.6e}\n",
qr.factor1, qr.factor2, qr.q_chisq, qr.q_df, qr.q_p
));
eprintln!(
"Q_Factor({}, {}): chisq={:.4}, df={}, p={:.4e}",
qr.factor1, qr.factor2, qr.q_chisq, qr.q_df, qr.q_p
);
}
}
} else {
eprintln!("Q_Factor: skipped (fewer than 2 factors in model)");
}
}
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("Results written to {}", args.out.display());
Ok(())
}
fn run_gwas(args: GwasArgs) -> Result<()> {
if let Some(t) = args.threads {
rayon::ThreadPoolBuilder::new()
.num_threads(t)
.build_global()
.context("failed to initialize thread pool")?;
}
let json = std::fs::read_to_string(&args.covstruc)
.with_context(|| format!("failed to read {}", args.covstruc.display()))?;
let ldsc_result = gsem_ldsc::LdscResult::from_json_string(&json)?;
let model_str = if let Some(m) = args.model {
m
} else if let Some(f) = args.model_file {
std::fs::read_to_string(&f)
.with_context(|| format!("failed to read model file {}", f.display()))?
} else {
anyhow::bail!("must provide --model or --model-file");
};
let gc_mode: gsem::gwas::gc_correction::GcMode = args.gc.parse().expect("infallible");
let k = ldsc_result.s.nrows();
let mut i_ld = ldsc_result.i_mat.to_owned();
for i in 0..k {
if i_ld[(i, i)] < 1.0 {
i_ld[(i, i)] = 1.0;
}
}
let sub_filters: Option<Vec<String>> = args.sub.map(|s| {
s.split(',')
.map(|entry| entry.trim().to_string())
.filter(|entry| !entry.is_empty())
.collect()
});
let estimation = gsem_sem::EstimationMethod::from_str_lossy(&args.estimation);
if args.twas {
run_gwas_twas(
&args.sumstats,
&model_str,
estimation,
&args.gc,
gc_mode,
k,
args.std_lv,
args.smooth_check,
&ldsc_result,
&i_ld,
&sub_filters,
&args.out,
)
} else {
run_gwas_snp(
&args.sumstats,
&model_str,
estimation,
&args.gc,
gc_mode,
k,
args.std_lv,
args.smooth_check,
&ldsc_result,
&i_ld,
&sub_filters,
&args.out,
args.save_plot_manhattan.as_deref(),
args.save_plot_qq.as_deref(),
)
}
}
#[allow(clippy::too_many_arguments)]
fn run_gwas_snp(
sumstats: &Path,
model_str: &str,
estimation: gsem_sem::EstimationMethod,
gc: &str,
gc_mode: gsem::gwas::gc_correction::GcMode,
k: usize,
std_lv: bool,
smooth_check: bool,
ldsc_result: &gsem_ldsc::LdscResult,
i_ld: &Mat<f64>,
sub_filters: &Option<Vec<String>>,
out: &Path,
save_plot_manhattan: Option<&Path>,
save_plot_qq: Option<&Path>,
) -> Result<()> {
eprintln!("Reading merged sumstats: {}", sumstats.display());
let merged = gsem::io::sumstats_reader::read_merged_sumstats(sumstats)
.with_context(|| format!("failed to read {}", sumstats.display()))?;
let n_snps = merged.len();
eprintln!("GWAS: {n_snps} SNPs, {k} traits, estimation={estimation}, gc={gc}");
if merged.trait_names.len() != k {
anyhow::bail!(
"trait count mismatch: LDSC has {k} traits, sumstats has {} (beta.* columns)",
merged.trait_names.len()
);
}
let beta_snp = merged.beta_rows();
let se_snp = merged.se_rows();
let var_snp = merged.var_snp();
let model = gsem_sem::syntax::parse_model(model_str, std_lv)
.map_err(|e| anyhow::anyhow!("model parse error: {e}"))?;
let config = gsem::gwas::user_gwas::UserGwasConfig {
model,
estimation,
gc: gc_mode,
max_iter: 500,
smooth_check,
snp_se: None,
variant_label: gsem::gwas::user_gwas::VariantLabel::Snp,
q_snp: false,
fix_measurement: true,
num_threads: None,
};
eprintln!("Running GWAS across {n_snps} SNPs...");
let pb = indicatif::ProgressBar::new(n_snps as u64);
pb.set_style(
indicatif::ProgressStyle::with_template(
"[{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} SNPs ({eta})",
)
.unwrap(),
);
let results = gsem::gwas::user_gwas::run_user_gwas(
&config,
&ldsc_result.s,
&ldsc_result.v,
i_ld,
&beta_snp,
&se_snp,
&var_snp,
Some(&|| pb.inc(1)),
);
pb.finish_with_message("complete");
let mut output = String::from("SNP\tlhs\top\trhs\test\tse\tz\tp\tchisq\tdf\tconverged\n");
for snp_result in &results {
let snp_name = &merged.snp[snp_result.snp_idx];
for param in &snp_result.params {
if let Some(filters) = sub_filters {
let param_key = format!("{}{}{}", param.lhs, param.op, param.rhs);
if !filters.iter().any(|f| f == ¶m_key) {
continue;
}
}
output.push_str(&format!(
"{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.4}\t{:.6e}\t{:.4}\t{}\t{}\n",
snp_name,
param.lhs,
param.op,
param.rhs,
param.est,
param.se,
param.z_stat,
param.p_value,
snp_result.chisq,
snp_result.chisq_df,
snp_result.converged,
));
}
}
std::fs::write(out, &output).with_context(|| format!("failed to write {}", out.display()))?;
let n_converged = results.iter().filter(|r| r.converged).count();
eprintln!(
"GWAS complete: {n_snps} SNPs, {n_converged} converged. Results: {}",
out.display()
);
if save_plot_manhattan.is_some() || save_plot_qq.is_some() {
let (p_values, manhattan_pts) = collect_plot_data(&results, &merged, sub_filters);
if let Some(qq_path) = save_plot_qq {
gsem::plot::qq::qq_plot(&p_values, "GWAS QQ Plot", qq_path)?;
eprintln!("QQ plot: {}", qq_path.display());
}
if let Some(man_path) = save_plot_manhattan {
if manhattan_pts.is_empty() {
log::warn!(
"Manhattan plot skipped: merged sumstats {} has no CHR/BP columns",
sumstats.display()
);
} else {
gsem::plot::manhattan::manhattan_plot(
&manhattan_pts,
"GWAS Manhattan Plot",
man_path,
Some(100_000),
)?;
eprintln!("Manhattan plot: {}", man_path.display());
}
}
}
Ok(())
}
fn collect_plot_data(
results: &[gsem::gwas::user_gwas::SnpResult],
merged: &gsem::io::sumstats_reader::MergedSumstats,
sub_filters: &Option<Vec<String>>,
) -> (Vec<f64>, Vec<gsem::plot::ManhattanPoint>) {
let mut p_values = Vec::with_capacity(results.len());
let mut manhattan = Vec::new();
let chr_vec = merged.chr.as_ref();
let bp_vec = merged.bp.as_ref();
for snp_result in results {
let param = snp_result.params.iter().find(|p| {
if let Some(filters) = sub_filters {
let key = format!("{}{}{}", p.lhs, p.op, p.rhs);
filters.iter().any(|f| f == &key)
} else {
true
}
});
let Some(param) = param else { continue };
if !param.p_value.is_finite() {
continue;
}
p_values.push(param.p_value);
if let (Some(chrs), Some(bps)) = (chr_vec, bp_vec)
&& let (Some(&chr), Some(&bp)) =
(chrs.get(snp_result.snp_idx), bps.get(snp_result.snp_idx))
{
manhattan.push(gsem::plot::ManhattanPoint {
chr,
bp,
neg_log10_p: gsem::plot::neg_log10_p(param.p_value),
});
}
}
(p_values, manhattan)
}
#[allow(clippy::too_many_arguments)]
fn run_gwas_twas(
sumstats: &Path,
model_str: &str,
estimation: gsem_sem::EstimationMethod,
gc: &str,
gc_mode: gsem::gwas::gc_correction::GcMode,
k: usize,
std_lv: bool,
smooth_check: bool,
ldsc_result: &gsem_ldsc::LdscResult,
i_ld: &Mat<f64>,
sub_filters: &Option<Vec<String>>,
out: &Path,
) -> Result<()> {
eprintln!("Reading TWAS sumstats: {}", sumstats.display());
let twas_data = gsem::io::twas_reader::read_twas_sumstats(sumstats)
.with_context(|| format!("failed to read TWAS sumstats {}", sumstats.display()))?;
let n_genes = twas_data.genes.len();
eprintln!("TWAS: {n_genes} genes, {k} traits, estimation={estimation}, gc={gc}");
if twas_data.trait_names.len() != k {
anyhow::bail!(
"trait count mismatch: LDSC has {k} traits, TWAS sumstats has {} (beta.* columns)",
twas_data.trait_names.len()
);
}
let beta_gene: Vec<&[f64]> = twas_data.genes.iter().map(|g| g.beta.as_slice()).collect();
let se_gene: Vec<&[f64]> = twas_data.genes.iter().map(|g| g.se.as_slice()).collect();
let var_gene: Vec<f64> = twas_data.genes.iter().map(|g| g.hsq).collect();
let twas_model_str = model_str.replace("SNP", "Gene");
let model = gsem_sem::syntax::parse_model(&twas_model_str, std_lv)
.map_err(|e| anyhow::anyhow!("model parse error: {e}"))?;
let config = gsem::gwas::user_gwas::UserGwasConfig {
model,
estimation,
gc: gc_mode,
max_iter: 500,
smooth_check,
snp_se: None,
variant_label: gsem::gwas::user_gwas::VariantLabel::Gene,
q_snp: false,
fix_measurement: true,
num_threads: None,
};
eprintln!("Running TWAS across {n_genes} genes...");
let pb = indicatif::ProgressBar::new(n_genes as u64);
pb.set_style(
indicatif::ProgressStyle::with_template(
"[{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} genes ({eta})",
)
.unwrap(),
);
let results = gsem::gwas::user_gwas::run_user_gwas(
&config,
&ldsc_result.s,
&ldsc_result.v,
i_ld,
&beta_gene,
&se_gene,
&var_gene,
Some(&|| pb.inc(1)),
);
pb.finish_with_message("complete");
let mut output =
String::from("Gene\tPanel\tHSQ\tlhs\top\trhs\test\tse\tz\tp\tchisq\tdf\tconverged\n");
for gene_result in &results {
let gene = &twas_data.genes[gene_result.snp_idx];
for param in &gene_result.params {
if let Some(filters) = sub_filters {
let param_key = format!("{}{}{}", param.lhs, param.op, param.rhs);
if !filters.iter().any(|f| f == ¶m_key) {
continue;
}
}
output.push_str(&format!(
"{}\t{}\t{:.6}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.4}\t{:.6e}\t{:.4}\t{}\t{}\n",
gene.gene,
gene.panel,
gene.hsq,
param.lhs,
param.op,
param.rhs,
param.est,
param.se,
param.z_stat,
param.p_value,
gene_result.chisq,
gene_result.chisq_df,
gene_result.converged,
));
}
}
std::fs::write(out, &output).with_context(|| format!("failed to write {}", out.display()))?;
let n_converged = results.iter().filter(|r| r.converged).count();
eprintln!(
"TWAS complete: {n_genes} genes, {n_converged} converged. Results: {}",
out.display()
);
Ok(())
}
fn run_commonfactor_cmd(args: CommonFactorArgs) -> Result<()> {
let json = std::fs::read_to_string(&args.covstruc)
.with_context(|| format!("failed to read {}", args.covstruc.display()))?;
let ldsc_result = gsem_ldsc::LdscResult::from_json_string(&json)?;
let estimation = gsem_sem::EstimationMethod::from_str_lossy(&args.estimation);
eprintln!("Fitting common factor model (estimation={estimation})...");
let result =
gsem_sem::commonfactor::run_commonfactor(&ldsc_result.s, &ldsc_result.v, estimation)?;
eprintln!(
"Converged. Chi-sq={:.4}, df={}, CFI={:.4}, SRMR={:.4}",
result.fit.chisq, result.fit.df, result.fit.cfi, result.fit.srmr
);
let mut output = String::from("lhs\top\trhs\test\tse\tz\tp\n");
for param in &result.parameters {
output.push_str(&format!(
"{}\t{}\t{}\t{:.6}\t{:.6}\t{:.4}\t{:.6e}\n",
param.lhs, param.op, param.rhs, param.est, param.se, param.z, param.p
));
}
output.push_str(&format!(
"\n# Model fit: chisq={:.4} df={} p={:.6e} AIC={:.4} CFI={:.4} SRMR={:.4}\n",
result.fit.chisq,
result.fit.df,
result.fit.p_chisq,
result.fit.aic,
result.fit.cfi,
result.fit.srmr
));
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("Results written to {}", args.out.display());
Ok(())
}
fn run_commonfactor_gwas(args: CommonfactorGwasArgs) -> Result<()> {
if std::env::var("GSEMR_COMMONFACTOR_GWAS_QUIET")
.map(|v| v.is_empty())
.unwrap_or(true)
{
eprintln!(
"note: `gsem commonfactorGWAS` fits a single-factor model using fixed-variance\n\
identification (F1 ~~ 1*F1, loadings free) with the fix_measurement baseline\n\
optimization. This matches GenomicSEM::userGWAS on the equivalent model but does\n\
NOT numerically match GenomicSEM::commonfactorGWAS, which uses marker-indicator\n\
identification. On real GWAS data the two can disagree in sign and magnitude.\n\
See ARCHITECTURE.md section 3.3 for the full rationale. Suppress with\n\
GSEMR_COMMONFACTOR_GWAS_QUIET=1."
);
}
if let Some(t) = args.threads {
rayon::ThreadPoolBuilder::new()
.num_threads(t)
.build_global()
.context("failed to initialize thread pool")?;
}
let json = std::fs::read_to_string(&args.covstruc)
.with_context(|| format!("failed to read {}", args.covstruc.display()))?;
let ldsc_result = gsem_ldsc::LdscResult::from_json_string(&json)?;
let gc_mode: gsem::gwas::gc_correction::GcMode = args.gc.parse().expect("infallible");
let k = ldsc_result.s.nrows();
eprintln!("Reading merged sumstats: {}", args.sumstats.display());
let merged = gsem::io::sumstats_reader::read_merged_sumstats(&args.sumstats)
.with_context(|| format!("failed to read {}", args.sumstats.display()))?;
let n_snps = merged.len();
let gc = &args.gc;
eprintln!("Common factor GWAS: {n_snps} SNPs, {k} traits, gc={gc}");
if merged.trait_names.len() != k {
anyhow::bail!(
"trait count mismatch: LDSC has {k} traits, sumstats has {} (beta.* columns)",
merged.trait_names.len()
);
}
let beta_snp = merged.beta_rows();
let se_snp = merged.se_rows();
let var_snp = merged.var_snp();
let mut i_ld = ldsc_result.i_mat.to_owned();
for i in 0..k {
if i_ld[(i, i)] < 1.0 {
i_ld[(i, i)] = 1.0;
}
}
eprintln!("Running common factor GWAS across {n_snps} SNPs...");
let pb = indicatif::ProgressBar::new(n_snps as u64);
pb.set_style(
indicatif::ProgressStyle::with_template(
"[{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} SNPs ({eta})",
)
.unwrap(),
);
let cf_config = gsem::gwas::common_factor::CommonFactorGwasConfig {
gc: gc_mode,
..Default::default()
};
let results = gsem::gwas::common_factor::run_common_factor_gwas(
&merged.trait_names,
&ldsc_result.s,
&ldsc_result.v,
&i_ld,
&beta_snp,
&se_snp,
&var_snp,
&cf_config,
Some(&|| pb.inc(1)),
);
pb.finish_with_message("complete");
let mut output = String::from("SNP\tlhs\top\trhs\test\tse\tz\tp\tchisq\tdf\tconverged\n");
for snp_result in &results {
let snp_name = &merged.snp[snp_result.snp_idx];
for param in &snp_result.params {
output.push_str(&format!(
"{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.4}\t{:.6e}\t{:.4}\t{}\t{}\n",
snp_name,
param.lhs,
param.op,
param.rhs,
param.est,
param.se,
param.z_stat,
param.p_value,
snp_result.chisq,
snp_result.chisq_df,
snp_result.converged,
));
}
}
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
let n_converged = results.iter().filter(|r| r.converged).count();
eprintln!(
"Common factor GWAS complete: {n_snps} SNPs, {n_converged} converged. Results: {}",
args.out.display()
);
if args.save_plot_manhattan.is_some() || args.save_plot_qq.is_some() {
let (p_values, manhattan_pts) = collect_plot_data(&results, &merged, &None);
if let Some(qq_path) = args.save_plot_qq.as_deref() {
gsem::plot::qq::qq_plot(&p_values, "Common Factor GWAS QQ Plot", qq_path)?;
eprintln!("QQ plot: {}", qq_path.display());
}
if let Some(man_path) = args.save_plot_manhattan.as_deref() {
if manhattan_pts.is_empty() {
log::warn!("Manhattan plot skipped: merged sumstats has no CHR/BP columns");
} else {
gsem::plot::manhattan::manhattan_plot(
&manhattan_pts,
"Common Factor GWAS Manhattan Plot",
man_path,
Some(100_000),
)?;
eprintln!("Manhattan plot: {}", man_path.display());
}
}
}
Ok(())
}
fn run_write_model(args: WriteModelArgs) -> Result<()> {
let content = std::fs::read_to_string(&args.loadings)
.with_context(|| format!("failed to read {}", args.loadings.display()))?;
let mut rows: Vec<Vec<f64>> = Vec::new();
for line in content.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let vals: Vec<f64> = trimmed
.split(['\t', ' '])
.filter(|s| !s.is_empty())
.map(|s| {
s.parse::<f64>()
.with_context(|| format!("failed to parse float: {s:?}"))
})
.collect::<Result<Vec<_>>>()?;
rows.push(vals);
}
if rows.is_empty() {
anyhow::bail!("loadings file is empty");
}
let n_rows = rows.len();
let n_cols = rows[0].len();
let loadings = Mat::from_fn(n_rows, n_cols, |i, j| rows[i][j]);
let cutoff = args.cutoff;
let fix_resid = args.fix_resid;
let bifactor = args.bifactor;
eprintln!(
"Generating model from {}x{} loadings matrix (cutoff={cutoff}, fix_resid={fix_resid}, bifactor={bifactor})",
n_rows, n_cols
);
let model_str = gsem_sem::write_model::write_model(
&loadings,
&args.names,
cutoff,
fix_resid,
bifactor,
false,
false,
);
std::fs::write(&args.out, &model_str)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("Model written to {}", args.out.display());
Ok(())
}
fn run_parallel_analysis(args: ParallelAnalysisArgs) -> Result<()> {
let json = std::fs::read_to_string(&args.covstruc)
.with_context(|| format!("failed to read {}", args.covstruc.display()))?;
let ldsc_result = gsem_ldsc::LdscResult::from_json_string(&json)?;
let k = ldsc_result.s.nrows();
let n_sim = args.n_sim;
eprintln!("Running parallel analysis ({k} traits, {n_sim} simulations)...");
let pb = indicatif::ProgressBar::new(n_sim as u64);
pb.set_style(
indicatif::ProgressStyle::with_template(
"[{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} simulations ({eta})",
)
.unwrap(),
);
let result = gsem::stats::parallel_analysis::parallel_analysis(
&ldsc_result.s,
&ldsc_result.v,
n_sim,
0.95,
false,
None,
Some(&|| pb.inc(1)),
);
pb.finish_with_message("complete");
eprintln!("Suggested number of factors: {}", result.n_factors);
let mut output = String::from("factor\tobserved\tsimulated_95\n");
for (i, (obs, sim)) in result
.observed
.iter()
.zip(result.simulated_95.iter())
.enumerate()
{
output.push_str(&format!("{}\t{:.6}\t{:.6}\n", i + 1, obs, sim));
}
output.push_str(&format!("\n# n_factors={}\n", result.n_factors));
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("Results written to {}", args.out.display());
if let Some(plot_path) = args.save_plot.as_deref() {
let points: Vec<gsem::plot::scree::ScreePoint> = result
.observed
.iter()
.zip(result.simulated_95.iter())
.enumerate()
.map(|(i, (obs, sim))| gsem::plot::scree::ScreePoint {
factor: i + 1,
observed: *obs,
simulated_95: *sim,
})
.collect();
gsem::plot::scree::scree_plot(&points, "Parallel Analysis Scree Plot", plot_path)?;
eprintln!("Scree plot: {}", plot_path.display());
}
Ok(())
}
fn run_enrich(args: EnrichArgs) -> Result<()> {
let json = std::fs::read_to_string(&args.input)
.with_context(|| format!("failed to read {}", args.input.display()))?;
#[derive(serde::Deserialize)]
struct EnrichInput {
s_baseline: Vec<Vec<f64>>,
s_annot: Vec<Vec<Vec<f64>>>,
v_annot: Vec<Vec<Vec<f64>>>,
annotation_names: Vec<String>,
m_annot: Vec<f64>,
m_total: f64,
}
let data: EnrichInput =
serde_json::from_str(&json).context("failed to parse enrichment input JSON")?;
let k = data.s_baseline.len();
let s_baseline = Mat::from_fn(k, k, |i, j| data.s_baseline[i][j]);
let s_annot: Vec<Mat<f64>> = data
.s_annot
.iter()
.map(|m| {
let rows = m.len();
let cols = if rows > 0 { m[0].len() } else { 0 };
Mat::from_fn(rows, cols, |i, j| m[i][j])
})
.collect();
let v_annot: Vec<Mat<f64>> = data
.v_annot
.iter()
.map(|m| {
let rows = m.len();
let cols = if rows > 0 { m[0].len() } else { 0 };
Mat::from_fn(rows, cols, |i, j| m[i][j])
})
.collect();
eprintln!(
"Running enrichment analysis ({} annotations, {k} traits)...",
data.annotation_names.len()
);
let result = gsem::stats::enrich::enrichment_test(
&s_baseline,
&s_annot,
&v_annot,
&data.annotation_names,
&data.m_annot,
data.m_total,
);
let mut output = String::from("annotation\tenrichment\tse\tp\n");
for i in 0..result.annotations.len() {
output.push_str(&format!(
"{}\t{:.6}\t{:.6}\t{:.6e}\n",
result.annotations[i], result.enrichment[i], result.se[i], result.p[i]
));
}
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("Enrichment results written to {}", args.out.display());
Ok(())
}
fn run_rgmodel_cmd(args: RgmodelArgs) -> Result<()> {
let json = std::fs::read_to_string(&args.covstruc)
.with_context(|| format!("failed to read {}", args.covstruc.display()))?;
let ldsc_result = gsem_ldsc::LdscResult::from_json_string(&json)?;
let estimation = gsem_sem::EstimationMethod::from_str_lossy(&args.estimation);
eprintln!("Fitting rgmodel (estimation={estimation})...");
let result = gsem_sem::rgmodel::run_rgmodel_with_model(
&ldsc_result.s,
&ldsc_result.v,
estimation,
args.model.as_deref(),
args.std_lv,
)?;
let k = result.r.nrows();
eprintln!(
"Converged. R is {k}x{k}. Underlying model chi-sq={:.4}, df={}, CFI={:.4}",
result.sem_result.fit.chisq, result.sem_result.fit.df, result.sem_result.fit.cfi
);
let mut output = String::new();
output.push_str("# Model-implied genetic correlation matrix R\n");
let col_names: Vec<String> = (0..k).map(|i| format!("V{}", i + 1)).collect();
output.push_str(&format!("\t{}\n", col_names.join("\t")));
for i in 0..k {
output.push_str(&format!("V{}", i + 1));
for j in 0..k {
output.push_str(&format!("\t{:.6}", result.r[(i, j)]));
}
output.push('\n');
}
output.push_str("\n# Sampling covariance of vech(R) - V_R\n");
let kstar = k * (k + 1) / 2;
for i in 0..kstar {
for j in 0..kstar {
if j > 0 {
output.push('\t');
}
output.push_str(&format!("{:.6e}", result.v_r[(i, j)]));
}
output.push('\n');
}
output.push_str("\n# SEM parameter estimates\n");
output.push_str("lhs\top\trhs\test\tse\tz\tp\n");
for param in &result.sem_result.parameters {
output.push_str(&format!(
"{}\t{}\t{}\t{:.6}\t{:.6}\t{:.4}\t{:.6e}\n",
param.lhs, param.op, param.rhs, param.est, param.se, param.z, param.p
));
}
output.push_str(&format!(
"\n# Model fit: chisq={:.4} df={} p={:.6e} AIC={:.4} CFI={:.4} SRMR={:.4}\n",
result.sem_result.fit.chisq,
result.sem_result.fit.df,
result.sem_result.fit.p_chisq,
result.sem_result.fit.aic,
result.sem_result.fit.cfi,
result.sem_result.fit.srmr
));
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("Results written to {}", args.out.display());
Ok(())
}
fn run_multi_snp_cmd(args: MultiSnpArgs) -> Result<()> {
let json = std::fs::read_to_string(&args.covstruc)
.with_context(|| format!("failed to read {}", args.covstruc.display()))?;
let ldsc_result = gsem_ldsc::LdscResult::from_json_string(&json)?;
let k = ldsc_result.s.nrows();
let model_str = if let Some(m) = args.model {
m
} else if let Some(f) = args.model_file {
std::fs::read_to_string(&f)
.with_context(|| format!("failed to read model file {}", f.display()))?
} else {
anyhow::bail!("must provide --model or --model-file");
};
eprintln!("Reading merged sumstats: {}", args.sumstats.display());
let merged = gsem::io::sumstats_reader::read_merged_sumstats(&args.sumstats)
.with_context(|| format!("failed to read {}", args.sumstats.display()))?;
if merged.trait_names.len() != k {
anyhow::bail!(
"trait count mismatch: LDSC has {k} traits, sumstats has {} (beta.* columns)",
merged.trait_names.len()
);
}
eprintln!("Reading LD matrix: {}", args.ld_matrix.display());
let (ld_mat, _ld_names) = gsem::gwas::multi_snp::read_ld_matrix(&args.ld_matrix)?;
let n_snps = ld_mat.nrows();
eprintln!("LD matrix: {n_snps} x {n_snps}");
if n_snps > merged.len() {
anyhow::bail!(
"LD matrix has {n_snps} SNPs but sumstats only has {} SNPs",
merged.len()
);
}
let beta_snp: Vec<&[f64]> = (0..n_snps).map(|i| merged.beta_row(i)).collect();
let se_snp: Vec<&[f64]> = (0..n_snps).map(|i| merged.se_row(i)).collect();
let var_snp: Vec<f64> = (0..n_snps)
.map(|i| {
let m = merged.maf[i];
2.0 * m * (1.0 - m)
})
.collect();
let snp_names: Vec<String> = merged.snp[..n_snps].to_vec();
let model = gsem_sem::syntax::parse_model(&model_str, false)
.map_err(|e| anyhow::anyhow!("model parse error: {e}"))?;
let config = gsem::gwas::multi_snp::MultiSnpConfig {
model,
estimation: gsem_sem::EstimationMethod::from_str_lossy(&args.estimation),
max_iter: 500,
snp_var_se: None,
};
eprintln!("Running multi-SNP analysis with {n_snps} SNPs, {k} traits...");
let result = gsem::gwas::multi_snp::run_multi_snp(
&config,
&ldsc_result.s,
&ldsc_result.v,
&beta_snp,
&se_snp,
&var_snp,
&ld_mat,
&snp_names,
);
eprintln!(
"Done. converged={}, chisq={:.4}, df={}",
result.converged, result.chisq, result.chisq_df
);
let mut output = String::from("lhs\top\trhs\test\tse\tz\tp\n");
for param in &result.params {
output.push_str(&format!(
"{}\t{}\t{}\t{:.6}\t{:.6}\t{:.4}\t{:.6e}\n",
param.lhs, param.op, param.rhs, param.est, param.se, param.z_stat, param.p_value
));
}
output.push_str(&format!(
"\n# chisq={:.4} df={} converged={}\n",
result.chisq, result.chisq_df, result.converged
));
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("Results written to {}", args.out.display());
Ok(())
}
fn run_simulate(args: SimulateArgs) -> Result<()> {
let json = std::fs::read_to_string(&args.covstruc)
.with_context(|| format!("failed to read {}", args.covstruc.display()))?;
let ldsc_result = gsem_ldsc::LdscResult::from_json_string(&json)?;
let k = ldsc_result.s.nrows();
let n_vec: Vec<f64> = args
.n_per_trait
.split(',')
.map(|s| {
s.trim()
.parse::<f64>()
.with_context(|| format!("failed to parse sample size: {s:?}"))
})
.collect::<Result<Vec<_>>>()?;
if n_vec.len() != k {
anyhow::bail!(
"expected {k} sample sizes (one per trait), got {}",
n_vec.len()
);
}
let chromosomes: Vec<usize> = (1..=22).collect();
let ld_data = gsem::io::ld_reader::read_ld_scores(&args.ld, &args.ld, &chromosomes)
.context("failed to read LD scores")?;
let ld_scores: Vec<f64> = ld_data.records.iter().map(|r| r.l2).collect();
let n_snps = ld_scores.len();
eprintln!(
"Simulating GWAS: {k} traits, {n_snps} SNPs, M={:.0}",
ld_data.total_m
);
let z_all = gsem::stats::simulation::simulate_sumstats(
&ldsc_result.s,
&n_vec,
&ld_scores,
ld_data.total_m,
&gsem::stats::simulation::SimConfig::default(),
);
let mut output = String::new();
let headers: Vec<String> = (0..k).map(|i| format!("Z{}", i + 1)).collect();
output.push_str(&headers.join("\t"));
output.push('\n');
for s_idx in 0..n_snps {
let vals: Vec<String> = z_all
.iter()
.map(|z_t| format!("{:.6}", z_t[s_idx]))
.collect();
output.push_str(&vals.join("\t"));
output.push('\n');
}
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!(
"Simulated {n_snps} SNPs x {k} traits. Results: {}",
args.out.display()
);
Ok(())
}
fn parse_prevalences(s: &Option<String>, k: usize) -> Vec<Option<f64>> {
match s {
Some(s) => s
.split(',')
.map(|x| {
let trimmed = x.trim();
if trimmed.eq_ignore_ascii_case("na") || trimmed.is_empty() {
None
} else {
trimmed.parse().ok()
}
})
.collect(),
None => vec![None; k],
}
}
fn parse_chromosome_selection(select: &str, n_chr: usize) -> Vec<usize> {
match select.to_lowercase().as_str() {
"all" => (1..=n_chr).collect(),
"odd" => (1..=n_chr).filter(|c| c % 2 == 1).collect(),
"even" => (1..=n_chr).filter(|c| c % 2 == 0).collect(),
other => {
other
.split(',')
.filter_map(|s| s.trim().parse::<usize>().ok())
.collect()
}
}
}
fn run_hdl(args: HdlArgs) -> Result<()> {
use gsem_ldsc::hdl::{HdlConfig, HdlMethod, HdlTraitData, LdPiece};
eprintln!("Reading {} trait files for HDL...", args.traits.len());
let mut trait_data = Vec::new();
for path in &args.traits {
let records = gwas_reader::read_sumstats(path)
.with_context(|| format!("failed to read {}", path.display()))?;
let n_snps = records.len();
eprintln!(" {}: {} SNPs", path.display(), n_snps);
trait_data.push(HdlTraitData {
snp: records.iter().map(|r| r.snp.clone()).collect(),
z: records.iter().map(|r| r.z).collect(),
n: records.iter().map(|r| r.n).collect(),
a1: records.iter().map(|r| r.a1.clone()).collect(),
a2: records.iter().map(|r| r.a2.clone()).collect(),
});
}
let hdl_method = match args.method.to_lowercase().as_str() {
"jackknife" => HdlMethod::Jackknife,
_ => HdlMethod::Piecewise,
};
let config = HdlConfig {
method: hdl_method,
n_ref: args.n_ref,
};
let ld_path = &args.ld_path;
let pieces_file = ld_path.join("pieces.txt");
if !pieces_file.exists() {
anyhow::bail!(
"HDL LD reference directory not found or missing pieces.txt at {}.\n\
HDL requires a text-format LD reference panel directory containing:\n\
- pieces.txt: tab-delimited index file with columns: piece_idx, chr, n_snps\n\
- piece.{{N}}.snps.txt: per-piece SNP data with columns: SNP, A1, A2, LDscore\n\n\
The R version of GenomicSEM uses .rda files which cannot be read directly.\n\
Convert R LD reference panels to text format first, e.g. in R:\n\
\n load('UKB_SVD_eigen99_extraction/LDmatrix1.RData')\n\
\n write.table(data.frame(SNP=snps, A1=a1, A2=a2, LDscore=ld), \
'piece.1.snps.txt', sep='\\t', row.names=FALSE, quote=FALSE)",
pieces_file.display()
);
}
let pieces_content = std::fs::read_to_string(&pieces_file)
.with_context(|| format!("failed to read {}", pieces_file.display()))?;
let mut ld_pieces = Vec::new();
for line in pieces_content.lines() {
let line = line.trim();
if line.is_empty() || line.starts_with('#') || line.starts_with("piece") {
continue; }
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() < 3 {
continue;
}
let piece_idx: usize = fields[0]
.parse()
.with_context(|| format!("invalid piece index in pieces.txt: '{}'", fields[0]))?;
let snp_file = ld_path.join(format!("piece.{piece_idx}.snps.txt"));
if !snp_file.exists() {
eprintln!(
"Warning: piece file {} not found, skipping",
snp_file.display()
);
continue;
}
let snp_content = std::fs::read_to_string(&snp_file)
.with_context(|| format!("failed to read {}", snp_file.display()))?;
let mut snps = Vec::new();
let mut a1 = Vec::new();
let mut a2 = Vec::new();
let mut ld_scores = Vec::new();
for sline in snp_content.lines() {
let sline = sline.trim();
if sline.is_empty() || sline.starts_with('#') || sline.starts_with("SNP") {
continue;
}
let sf: Vec<&str> = sline.split('\t').collect();
if sf.len() < 4 {
continue;
}
snps.push(sf[0].to_string());
a1.push(sf[1].to_string());
a2.push(sf[2].to_string());
if let Ok(ld) = sf[3].parse::<f64>() {
ld_scores.push(ld);
} else {
ld_scores.push(0.0);
}
}
let m = snps.len();
if m > 0 {
ld_pieces.push(LdPiece {
snps,
a1,
a2,
ld_scores,
m,
});
}
}
if ld_pieces.is_empty() {
anyhow::bail!("no valid LD pieces loaded from {}", ld_path.display());
}
eprintln!(
"Loaded {} LD pieces ({} total SNPs)",
ld_pieces.len(),
ld_pieces.iter().map(|p| p.m).sum::<usize>()
);
let k = args.traits.len();
let sp = parse_prevalences(&args.sample_prev, k);
let pp = parse_prevalences(&args.pop_prev, k);
eprintln!("Running HDL...");
let result = gsem_ldsc::hdl::hdl(&trait_data, &sp, &pp, &ld_pieces, &config)?;
let json = result.to_json_string()?;
std::fs::write(&args.out, &json)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("HDL complete. Results written to {}", args.out.display());
let k = result.s.nrows();
eprintln!("\nGenetic covariance matrix (S):");
for i in 0..k {
let row: Vec<String> = (0..k)
.map(|j| format!("{:8.4}", result.s[(i, j)]))
.collect();
eprintln!(" {}", row.join(" "));
}
eprintln!("\nIntercepts (I):");
for i in 0..k {
let row: Vec<String> = (0..k)
.map(|j| format!("{:8.4}", result.i_mat[(i, j)]))
.collect();
eprintln!(" {}", row.join(" "));
}
Ok(())
}
fn run_summary_gls(args: SummaryGlsArgs) -> Result<()> {
let x_content = std::fs::read_to_string(&args.x)
.with_context(|| format!("failed to read {}", args.x.display()))?;
let x_rows: Vec<Vec<f64>> = x_content
.lines()
.filter(|l| !l.trim().is_empty())
.map(|l| {
l.split(['\t', ' '])
.filter(|s| !s.is_empty())
.map(|s| s.parse::<f64>().unwrap_or(0.0))
.collect()
})
.collect();
let n = x_rows.len();
if n == 0 {
anyhow::bail!("empty X matrix");
}
let p_base = x_rows[0].len();
let p = if args.intercept { p_base + 1 } else { p_base };
let x = faer::Mat::from_fn(n, p, |i, j| {
if args.intercept && j == 0 {
1.0
} else {
let col = if args.intercept { j - 1 } else { j };
x_rows[i][col]
}
});
let y_content = std::fs::read_to_string(&args.y)
.with_context(|| format!("failed to read {}", args.y.display()))?;
let y: Vec<f64> = y_content
.split_whitespace()
.filter_map(|s| s.parse().ok())
.collect();
if y.len() != n {
anyhow::bail!("Y length {} != X rows {}", y.len(), n);
}
let v_content = std::fs::read_to_string(&args.v)
.with_context(|| format!("failed to read {}", args.v.display()))?;
let v_rows: Vec<Vec<f64>> = v_content
.lines()
.filter(|l| !l.trim().is_empty())
.map(|l| {
l.split(['\t', ' '])
.filter(|s| !s.is_empty())
.map(|s| s.parse::<f64>().unwrap_or(0.0))
.collect()
})
.collect();
if v_rows.len() != n {
anyhow::bail!("V rows {} != Y length {}", v_rows.len(), n);
}
let v = faer::Mat::from_fn(n, n, |i, j| v_rows[i][j]);
eprintln!("GLS: {} observations, {} predictors", n, p);
let result = gsem::stats::gls::summary_gls(&x, &y, &v)
.ok_or_else(|| anyhow::anyhow!("GLS computation failed"))?;
let mut output = String::from("predictor\tbeta\tse\tz\tp\n");
for i in 0..result.beta.len() {
let name = if args.intercept && i == 0 {
"intercept".to_string()
} else {
format!("X{}", if args.intercept { i } else { i + 1 })
};
output.push_str(&format!(
"{}\t{:.6}\t{:.6}\t{:.4}\t{:.6e}\n",
name, result.beta[i], result.se[i], result.z[i], result.p[i]
));
}
std::fs::write(&args.out, &output)
.with_context(|| format!("failed to write {}", args.out.display()))?;
eprintln!("Results written to {}", args.out.display());
Ok(())
}