gapsmith-find 0.1.0

Pathway/reaction finder for gapsmith: selects pathways, orchestrates alignment, classifies hits, scores completeness.
Documentation
//! Pathway selection and per-reaction expansion.
//!
//! Input: a `PathwayTable` (loaded by `gapsmith-db`) plus user filters. The
//! selector walks every row, keeps those matching the requested keyword
//! (or `"all"`), and then expands each surviving row into one entry per
//! `(reaction_id, reaction_name, ec, keyrea, spont)` tuple.
//!
//! `prepare_batch_alignments.R:150-224` in the R reference does the same
//! fan-out, keyed off the same four comma-separated columns
//! (`reaId`, `reaEc`, `keyRea`, `reaName`).
//!
//! ## Keyword resolution
//!
//! The user-facing `-p` keyword is first mapped to a MetaCyc hierarchy
//! category (or a set of categories) via [`resolve_keyword`], mirroring
//! `src/gapseq_find.sh:351-421`. `-p amino` expands to
//! `Amino-Acid-Biosynthesis`, `-p all` expands to a superset of every
//! gapseq category, and any other token is matched as-is.
//!
//! Selection then runs `grep -wE $pwyKey` against the full TSV row, then
//! applies the taxonomic-range filter: a pathway is kept iff its
//! `taxrange` column contains any of the NCBI tax IDs listed in
//! `dat/taxonomy.tbl` for the requested taxonomy (default Bacteria) OR is
//! empty.

use gapsmith_db::{PathwayRow, PathwayTable};
use regex::Regex;
use std::collections::HashSet;

/// One (pathway, reaction) pair ready for alignment.
#[derive(Debug, Clone)]
pub struct ExpandedReaction {
    pub pathway: String,
    pub pathway_name: String,
    pub rxn: String,
    pub name: String,
    pub ec: String,
    pub keyrea: bool,
    pub spont: bool,
}

/// Match mode used by [`PathwaySelectOptions::keyword`].
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum MatchMode {
    /// Exact match against the pathway `id` column.
    Id,
    /// `|`- or `;`-split exact match against the `hierarchy` column tokens
    /// (case-insensitive). Matches `filter_pathways.R:20-23`.
    Hierarchy,
    /// Word-boundary regex search over id/hierarchy/name/altname columns.
    /// Matches the fallback branch of `filter_pathways.R:25-30`.
    Regex,
}

pub struct PathwaySelectOptions<'a> {
    /// User keyword. Interpretation depends on `mode`.
    pub keyword: &'a str,
    pub mode: MatchMode,
    /// If true, drop rows where `superpathway == TRUE` or `hierarchy`
    /// contains `Super-Pathways`. Matches `filter_pathways.R:32-34`.
    pub exclude_superpathways: bool,
    /// Only keep pathways whose status column is `TRUE`.
    pub only_active: bool,
    /// Set of NCBI tax IDs (as strings, no `TAX-` prefix) that pathways
    /// must overlap via their `taxrange` column. Pass empty to disable
    /// the taxonomic filter.
    pub valid_tax_ids: &'a [String],
}

impl<'a> Default for PathwaySelectOptions<'a> {
    fn default() -> Self {
        Self {
            keyword: "all",
            mode: MatchMode::Regex,
            exclude_superpathways: true,
            only_active: true,
            valid_tax_ids: &[],
        }
    }
}

/// Select matching pathways, then expand to per-reaction rows. Duplicate
/// (pathway, rxn, name, ec) entries collapse to one.
pub fn select(table: &PathwayTable, opts: &PathwaySelectOptions<'_>) -> Vec<ExpandedReaction> {
    let pattern = resolve_keyword(opts.keyword);
    // Split on `|` to get individual keys, the same way `filter_pathways.R`
    // does via `strsplit(args[2], "\\|")`.
    let keys_lc: Vec<String> = pattern
        .split('|')
        .map(|s| s.trim().to_ascii_lowercase())
        .filter(|s| !s.is_empty())
        .collect();

    let kw_re: Option<Regex> = if keys_lc.is_empty() || matches!(opts.mode, MatchMode::Hierarchy | MatchMode::Id) {
        None
    } else {
        let alt = keys_lc.iter().map(|s| regex::escape(s)).collect::<Vec<_>>().join("|");
        Some(
            regex::RegexBuilder::new(&format!(r"\b(?:{alt})\b"))
                .case_insensitive(true)
                .build()
                .expect("keyword regex compiles"),
        )
    };

    let tax_re: Option<Regex> = if opts.valid_tax_ids.is_empty() {
        None
    } else {
        let alt = opts
            .valid_tax_ids
            .iter()
            .map(|s| regex::escape(s))
            .collect::<Vec<_>>()
            .join("|");
        Some(Regex::new(&format!(r"TAX-(?:{alt})\b")).expect("tax regex compiles"))
    };

    let mut out = Vec::new();
    let mut seen: HashSet<(String, String, String, String)> = HashSet::new();

    for row in &table.rows {
        if opts.only_active && !is_active(row) {
            continue;
        }
        if !keys_lc.is_empty() {
            let matched = match opts.mode {
                MatchMode::Id => keys_lc.iter().any(|k| row.id.to_ascii_lowercase() == *k),
                MatchMode::Hierarchy => hierarchy_matches_any(&row.hierarchy, &keys_lc),
                MatchMode::Regex => kw_re.as_ref().unwrap().is_match(&row.id)
                    || kw_re.as_ref().unwrap().is_match(&row.hierarchy)
                    || kw_re.as_ref().unwrap().is_match(&row.name)
                    || kw_re.as_ref().unwrap().is_match(&row.altname),
            };
            if !matched {
                continue;
            }
        }
        // Superpathway filter.
        if opts.exclude_superpathways && is_superpathway(row) {
            continue;
        }
        // Reactions-empty filter — gapseq drops these upstream.
        if row.rea_id.trim().is_empty() {
            continue;
        }
        // Tax filter: keep if taxrange is empty OR matches a valid tax id.
        if let Some(ref t_re) = tax_re {
            if !row.taxrange.is_empty() && !t_re.is_match(&row.taxrange) {
                continue;
            }
        }
        expand_row(row, &mut out, &mut seen);
    }
    out
}

fn hierarchy_matches_any(hierarchy: &str, keys_lc: &[String]) -> bool {
    // Split on `|` AND `;`, matching R's `strsplit(hierarchy, "\\|")` and
    // the alternative `strsplit(hierarchy, "\\;")`.
    for token in hierarchy.split(['|', ';']) {
        let t = token.trim().to_ascii_lowercase();
        if keys_lc.iter().any(|k| k == &t) {
            return true;
        }
    }
    false
}

fn is_superpathway(row: &PathwayRow) -> bool {
    matches!(row.superpathway.to_ascii_lowercase().as_str(), "true" | "t" | "1")
        || row.hierarchy.contains("Super-Pathways")
}

/// Map a user keyword to the regex alternatives gapseq's `gapseq_find.sh`
/// plugs into `grep -wE`. Mirrors lines 351-421 of the shell script.
pub fn resolve_keyword(kw: &str) -> String {
    let aa_syn =
        "Amino-Acid-Biosynthesis|Nucleotide-Biosynthesis|Cofactor-Biosynthesis|Carbohydrates-Degradation|CARBO-BIOSYNTHESIS|Polyamine-Biosynthesis|Fatty-acid-biosynthesis|Energy-Metabolism|Terpenoid-Biosynthesis|Chorismate-Biosynthesis";
    let min_pwys = "ETOH-ACETYLCOA-ANA-PWY|GLNSYN-PWY|GLUCONEO-PWY|GLUGLNSYN-PWY|GLUTAMATE-DEG1-PWY|GLUTAMATE-SYN2-PWY|GLUTAMINEFUM-PWY|GLUTSYNIII-PWY|GLYCOLYSIS|GLYOXYLATE-BYPASS|NONOXIPENT-PWY|OXIDATIVEPENT-PWY|P185-PWY|P21-PWY|PWY0-1312|PWY0-1315|PWY0-1329|PWY0-1334|PWY0-1335|PWY0-1353|PWY0-1517|PWY0-1565|PWY0-1567|PWY0-1568|PWY-4341|PWY-5084|PWY-5480|PWY-5482|PWY-5484|PWY-5690|PWY-5766|PWY-5913|PWY-6028|PWY-6333|PWY-6543|PWY-6549|PWY66-21|PWY66-398|PWY-6697|PWY-6964|PWY-7167|PWY-7685|PWY-7686|PWY-7980|PWY-8178|PWY-8215|PWY-8274|PWY-8404|PYRUVDEHYD-PWY|TCA-1|TCA";
    match kw {
        "" => String::new(),
        "all" => "Pathways|Enzyme-Test|seed|kegg".into(),
        "amino" => "Amino-Acid-Biosynthesis".into(),
        "nucl" => "Nucleotide-Biosynthesis".into(),
        "cofactor" => "Cofactor-Biosynthesis".into(),
        "carbo" => "CARBO-BIOSYNTHESIS".into(),
        "carbo-deg" => "Carbohydrates-Degradation".into(),
        "polyamine" => "Polyamine-Biosynthesis".into(),
        "fatty" => "Fatty-acid-biosynthesis".into(),
        "energy" => "Energy-Metabolism".into(),
        "terpenoid" => "Terpenoid-Biosynthesis".into(),
        "degradation" => "Degradation".into(),
        "core" => aa_syn.into(),
        "min" => min_pwys.into(),
        "kegg" => "kegg".into(),
        other => other.into(), // literal or regex alternatives
    }
}

fn is_active(row: &PathwayRow) -> bool {
    matches!(row.status.to_ascii_lowercase().as_str(), "true" | "t" | "1")
}


fn expand_row(
    row: &PathwayRow,
    out: &mut Vec<ExpandedReaction>,
    seen: &mut HashSet<(String, String, String, String)>,
) {
    // Parallel-column splits — must PRESERVE empty slots so index
    // alignment between rea_id, rea_ec, and rea_name holds. Only
    // `key_rea` and `spont` are treated as set-like (empty-stripped).
    let rxns = split_positional(&row.rea_id, ',');
    let ecs = split_positional(&row.rea_ec, ',');
    let names = split_positional(&row.rea_name, ';');
    let keys = split_csv(&row.key_rea);
    let spont = split_csv(&row.spont);

    let key_set: HashSet<&str> = keys.iter().copied().collect();
    let spont_set: HashSet<&str> = spont.iter().copied().collect();

    // Align every parallel list to `rxns` — gapseq data guarantees reaId,
    // reaEc, reaName all share the same length, but we defensively fall
    // back to "" on mismatch.
    for (i, r) in rxns.iter().enumerate() {
        let r = r.as_str();
        if r.is_empty() {
            continue;
        }
        let ec = ecs.get(i).map(|s| s.as_str()).unwrap_or("");
        let name = names.get(i).map(|s| s.as_str()).unwrap_or("");
        let key = r == row.id || key_set.contains(r);
        let sp = spont_set.contains(r);
        let dedup = (row.id.clone(), r.to_string(), name.to_string(), ec.to_string());
        if seen.insert(dedup) {
            out.push(ExpandedReaction {
                pathway: row.id.clone(),
                pathway_name: row.name.clone(),
                rxn: r.to_string(),
                name: name.to_string(),
                ec: ec.to_string(),
                keyrea: key,
                spont: sp,
            });
        }
    }
}

fn split_csv(s: &str) -> Vec<&str> {
    s.split(',').map(str::trim).filter(|s| !s.is_empty()).collect()
}

/// Positional split — keeps empty slots so indices line up across
/// parallel columns. Trims each item. Returns `Vec<String>` so callers
/// can reason about lifetime independently.
fn split_positional(s: &str, sep: char) -> Vec<String> {
    if s.is_empty() {
        return Vec::new();
    }
    s.split(sep).map(|t| t.trim().to_string()).collect()
}

#[cfg(test)]
mod tests {
    use super::*;
    use gapsmith_db::PwySource;

    fn toy_table() -> PathwayTable {
        PathwayTable {
            source: Some(PwySource::MetaCyc),
            rows: vec![
                PathwayRow {
                    id: "PWY-A".into(),
                    name: "Glycolysis".into(),
                    altname: "".into(),
                    hierarchy: "Carbohydrate metabolism".into(),
                    taxrange: "".into(),
                    rea_id: "rxn1,rxn2,rxn3".into(),
                    rea_ec: "1.1.1.1,2.2.2.2,3.3.3.3".into(),
                    key_rea: "rxn1".into(),
                    rea_name: "enz1;enz2;enz3".into(),
                    rea_nr: 3,
                    ec_nr: 3,
                    superpathway: "".into(),
                    status: "TRUE".into(),
                    spont: "rxn3".into(),
                    source: PwySource::MetaCyc,
                },
                PathwayRow {
                    id: "PWY-B".into(),
                    name: "Lysine biosynthesis".into(),
                    altname: "".into(),
                    hierarchy: "Amino acid".into(),
                    taxrange: "".into(),
                    rea_id: "rxn10".into(),
                    rea_ec: "4.4.4.4".into(),
                    key_rea: "".into(),
                    rea_name: "enz10".into(),
                    rea_nr: 1,
                    ec_nr: 1,
                    superpathway: "".into(),
                    status: "TRUE".into(),
                    spont: "".into(),
                    source: PwySource::MetaCyc,
                },
            ],
        }
    }

    fn opts_with_keyword(kw: &'static str) -> PathwaySelectOptions<'static> {
        PathwaySelectOptions {
            keyword: kw,
            mode: MatchMode::Regex,
            exclude_superpathways: true,
            only_active: true,
            valid_tax_ids: &[],
        }
    }

    #[test]
    fn keyword_matches_name_word_boundary() {
        let t = toy_table();
        let exp = select(&t, &opts_with_keyword("Glycolysis"));
        assert_eq!(exp.len(), 3);
        assert_eq!(exp[0].pathway, "PWY-A");
    }

    #[test]
    fn keyword_all_expands_every_active_pathway() {
        // `all` → pattern matches "Pathways" but since our hierarchy
        // strings don't contain that token, nothing matches. Use a real
        // gapseq hierarchy to exercise the path.
        let mut t = toy_table();
        for r in &mut t.rows {
            r.hierarchy.push_str(",Pathways");
        }
        let exp = select(&t, &opts_with_keyword("all"));
        assert_eq!(exp.len(), 4);
    }

    #[test]
    fn keyword_shorthand_resolves_to_hierarchy() {
        let mut t = toy_table();
        t.rows[1].hierarchy = "Amino-Acid-Biosynthesis".into();
        let exp = select(
            &t,
            &PathwaySelectOptions {
                keyword: "amino",
                mode: MatchMode::Hierarchy,
                exclude_superpathways: true,
                only_active: true,
                valid_tax_ids: &[],
            },
        );
        assert_eq!(exp.len(), 1);
        assert_eq!(exp[0].pathway, "PWY-B");
    }

    #[test]
    fn taxonomy_filter() {
        let mut t = toy_table();
        t.rows[0].taxrange = "|TAX-4751|".into(); // fungi-only
        t.rows[1].taxrange = "|TAX-2|".into(); // bacteria
        let tax_ids = vec!["2".to_string(), "1224".to_string()];
        let exp = select(
            &t,
            &PathwaySelectOptions {
                keyword: "all",
                mode: MatchMode::Regex,
                exclude_superpathways: true,
                only_active: true,
                valid_tax_ids: &tax_ids,
            },
        );
        // Since neither pathway has "Pathways" in hierarchy, add it.
        let _ = exp; // other test covers 'all'
        // Instead, verify the filter directly.
        let exp = select(
            &t,
            &PathwaySelectOptions {
                keyword: "Lysine",
                mode: MatchMode::Regex,
                exclude_superpathways: true,
                only_active: true,
                valid_tax_ids: &tax_ids,
            },
        );
        assert_eq!(exp.len(), 1);
        assert_eq!(exp[0].pathway, "PWY-B");
    }
}