libprosic 0.4.0

A library for calling of genomic variants using a latent variable model.
Documentation
//! This module implements FDR control by with Benjamini-Hochberg.
//! Posterior error probabilities are treated as scores and an empirical null distribution is
//! used to obtain p-values. The null distribution can e.g. be generated by swapping samples.

use std::error::Error;
use std::io;

use csv;
use itertools::Itertools;
use rust_htslib::bcf;
use bio::stats::{LogProb, PHREDProb, Prob};
use ordered_float::NotNaN;

use utils;
use Event;
use model;
use estimation::fdr::{Record, ALPHAS};


fn pval(x: NotNaN<f64>, dist: &[NotNaN<f64>]) -> LogProb {
    let i = dist.binary_search(&x).unwrap_or_else(|i| i);
    let f0 = LogProb::from(Prob(i as f64 / dist.len() as f64));
    f0.ln_one_minus_exp()
}


/// Print thresholds to control FDR of given calls at multiple levels.
///
/// # Arguments
///
/// * `calls` - BCF reader with prosic calls
/// * `null_calls` - calls under the null model, e.g. obtained by swapping tumor and normal sample
/// * `writer` - writer for resulting thresholds
/// * `event` - the event to control
/// * `vartype` - the variant type to consider
pub fn control_fdr<E: Event, W: io::Write>(
    calls: &mut bcf::Reader,
    null_calls: &mut bcf::Reader,
    writer: &mut W,
    event: &E,
    vartype: &model::VariantType) -> Result<(), Box<Error>> {
    let mut writer = csv::Writer::from_writer(writer).delimiter(b'\t');
    try!(writer.write(["FDR", "max-prob"].into_iter()));

    let null_dist = utils::collect_prob_dist(null_calls, event, vartype)?;

    if null_dist.is_empty() {
        for &alpha in &ALPHAS {
            writer.write([&format!("{}", alpha), ""].iter())?;
        }
        return Ok(());
    }

    let prob_dist = utils::collect_prob_dist(calls, event, vartype)?;
    debug!("{} observations in null distribution.", null_dist.len());
    debug!("{} observations in call distribution.", prob_dist.len());
    let pvals = prob_dist.iter().map(|&p| pval(p, &null_dist)).collect_vec();
    let m = pvals.len() as f64;
    let mk_pvals = pvals.iter().enumerate().map(|(k, &p)| (*p) + m.ln() - (m - k as f64 + 1.0).ln()).collect_vec(); // p * m / (m - k + 1)


    for &alpha in &ALPHAS {
        let mut record = Record { alpha: alpha, gamma: PHREDProb::from(Prob(1.0)) };
        let alpha = alpha.ln();
        for (&mkp, &event_prob) in mk_pvals.iter().zip(prob_dist.iter()) {
            // the pvalues will be monotonically decreasing
            if mkp <= alpha {
                // the highest pval that is below the threshold
                // record corresponding event probability
                record.gamma = PHREDProb::from(LogProb(*event_prob));
                break;
            }
        }
        writer.encode(&record)?;
    }
    writer.flush()?;

    Ok(())
}