rsfgsea 0.3.4

High-performance fgsea-compatible preranked Gene Set Enrichment Analysis in Rust
Documentation
use crate::core::{Pathway, PathwayDb, RankedList};
use anyhow::{Context, Result};
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

pub fn read_ranked_list<P: AsRef<Path>>(path: P) -> Result<RankedList> {
    let file = File::open(path.as_ref())?;
    let reader = BufReader::new(file);

    let mut genes = Vec::new();
    let mut scores = Vec::new();
    let mut seen_genes = HashSet::new();

    for (line_idx, line) in reader.lines().enumerate() {
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }

        let parts: Vec<&str> = line.split_whitespace().collect();
        if parts.len() < 2 {
            anyhow::bail!(
                "Malformed ranked-list line {}: expected at least 2 whitespace-separated columns.",
                line_idx + 1
            );
        }

        let gene = parts[0].to_string();
        let score: f64 = parts[1]
            .parse()
            .with_context(|| format!("Failed to parse score on line {}", line_idx + 1))?;
        if !score.is_finite() {
            anyhow::bail!(
                "Non-finite score '{}' found on line {}. Ranked lists must contain only finite numeric scores.",
                parts[1],
                line_idx + 1
            );
        }

        if !seen_genes.insert(gene.clone()) {
            anyhow::bail!(
                "Duplicate gene '{}' found on line {}. Ranked lists must contain unique gene IDs.",
                gene,
                line_idx + 1
            );
        }

        genes.push(gene);
        scores.push(score);
    }

    Ok(RankedList::new(genes, scores))
}

pub fn read_gmt<P: AsRef<Path>>(path: P) -> Result<PathwayDb> {
    let file = File::open(path.as_ref())?;
    let reader = BufReader::new(file);

    let mut pathways = Vec::new();

    for (line_idx, line) in reader.lines().enumerate() {
        let line = line?;
        if line.trim().is_empty() {
            continue;
        }

        let parts: Vec<&str> = line.split('\t').collect();
        if parts.len() < 3 {
            anyhow::bail!(
                "Malformed GMT line {}: expected at least 3 tab-separated columns.",
                line_idx + 1
            );
        }

        let name = parts[0].to_string();
        let description = if parts[1].is_empty() {
            None
        } else {
            Some(parts[1].to_string())
        };
        let genes = parts[2..]
            .iter()
            .map(|&s| s.trim().to_string())
            .filter(|s| !s.is_empty())
            .collect();

        pathways.push(Pathway {
            name,
            description,
            genes,
        });
    }

    Ok(PathwayDb { pathways })
}