Skip to main content

rsomics_faith_pd/
lib.rs

1use std::collections::HashMap;
2use std::io::{BufRead, Write};
3
4use rsomics_common::{Result, RsomicsError};
5use rsomics_phylo_tree::{NodeId, Tree};
6
7/// A parsed count table: per-sample columns of feature counts.
8///
9/// Layout matches the rsomics diversity family (scikit-bio / QIIME / phyloseq,
10/// transposed): first column is the feature (OTU/taxon) ID, the header row
11/// names the samples, cell `[feature][sample]` is the count.
12pub struct CountTable {
13    pub feature_ids: Vec<String>,
14    pub sample_names: Vec<String>,
15    /// One count vector per sample (column-major), each of length `feature_ids.len()`.
16    pub columns: Vec<Vec<u64>>,
17}
18
19impl CountTable {
20    /// # Errors
21    /// Errors on a missing header, a ragged row, or a non-integer count.
22    pub fn parse<R: BufRead>(reader: R, delim: char) -> Result<CountTable> {
23        let mut lines = reader.lines();
24        let header = loop {
25            match lines.next() {
26                Some(line) => {
27                    let line = line.map_err(RsomicsError::Io)?;
28                    if line.trim().is_empty() || line.starts_with('#') {
29                        continue;
30                    }
31                    break line;
32                }
33                None => return Err(RsomicsError::InvalidInput("empty count table".into())),
34            }
35        };
36        let sample_names: Vec<String> = header
37            .split(delim)
38            .skip(1)
39            .map(|s| s.trim().to_string())
40            .collect();
41        if sample_names.is_empty() {
42            return Err(RsomicsError::InvalidInput(
43                "header has no sample columns (need feature-ID column + ≥1 sample)".into(),
44            ));
45        }
46        let n = sample_names.len();
47        let mut feature_ids = Vec::new();
48        let mut columns: Vec<Vec<u64>> = vec![Vec::new(); n];
49        for (row_idx, line) in lines.enumerate() {
50            let line = line.map_err(RsomicsError::Io)?;
51            if line.trim().is_empty() || line.starts_with('#') {
52                continue;
53            }
54            let mut fields = line.split(delim);
55            let feature = fields.next().unwrap_or("").trim().to_string();
56            let mut seen = 0usize;
57            for (col, field) in fields.enumerate() {
58                if col >= n {
59                    return Err(RsomicsError::InvalidInput(format!(
60                        "row {} (feature '{feature}') has more columns than the header",
61                        row_idx + 2
62                    )));
63                }
64                let count: u64 = field.trim().parse().map_err(|_| {
65                    RsomicsError::InvalidInput(format!(
66                        "row {} (feature '{feature}'), sample '{}': '{}' is not a non-negative integer count",
67                        row_idx + 2,
68                        sample_names[col],
69                        field.trim()
70                    ))
71                })?;
72                columns[col].push(count);
73                seen += 1;
74            }
75            if seen != n {
76                return Err(RsomicsError::InvalidInput(format!(
77                    "row {} (feature '{feature}') has {seen} count columns, header has {n}",
78                    row_idx + 2
79                )));
80            }
81            feature_ids.push(feature);
82        }
83        Ok(CountTable {
84            feature_ids,
85            sample_names,
86            columns,
87        })
88    }
89}
90
91pub struct Config {
92    pub delim: char,
93    pub precision: usize,
94}
95
96/// Tree resolved for Faith's PD: branch length per node (missing → 0.0), a
97/// tip-name → node-id map, and each node's parent for root-ward walks.
98struct PdTree {
99    branch_length: Vec<f64>,
100    parent: Vec<Option<NodeId>>,
101    tip_index: HashMap<String, NodeId>,
102    n_nodes: usize,
103}
104
105impl PdTree {
106    fn build(tree: &Tree) -> Result<PdTree> {
107        let n_nodes = tree.nodes.len();
108        if tree.nodes[tree.root].children.len() > 2 {
109            return Err(RsomicsError::InvalidInput(
110                "tree is not rooted (root has more than two children)".into(),
111            ));
112        }
113
114        let mut branch_length = vec![0.0f64; n_nodes];
115        let mut parent = vec![None; n_nodes];
116        let mut tip_index = HashMap::new();
117        for node in &tree.nodes {
118            parent[node.id] = node.parent;
119            match node.branch_length {
120                Some(bl) => branch_length[node.id] = bl,
121                None if node.id == tree.root => {}
122                None => {
123                    return Err(RsomicsError::InvalidInput(
124                        "every non-root node must have a branch length".into(),
125                    ));
126                }
127            }
128            if node.children.is_empty() {
129                let name = node
130                    .name
131                    .as_deref()
132                    .ok_or_else(|| RsomicsError::InvalidInput("a tip has no name".into()))?;
133                if tip_index.insert(name.to_string(), node.id).is_some() {
134                    return Err(RsomicsError::InvalidInput(format!(
135                        "duplicate tip name '{name}' in the tree"
136                    )));
137                }
138            }
139        }
140
141        Ok(PdTree {
142            branch_length,
143            parent,
144            tip_index,
145            n_nodes,
146        })
147    }
148
149    /// Faith's PD for one sample: sum branch lengths of every node on the path
150    /// from each observed tip up to the root.
151    ///
152    /// Faith 1992 (DOI 10.1016/0006-3207(92)91201-3); array form per Hamady,
153    /// Lozupone & Knight 2010 (Fast UniFrac).
154    fn faith_pd(&self, observed_tips: &[NodeId], scratch: &mut Vec<bool>) -> f64 {
155        scratch.clear();
156        scratch.resize(self.n_nodes, false);
157        let mut sum = 0.0;
158        for &tip in observed_tips {
159            let mut cur = Some(tip);
160            while let Some(id) = cur {
161                if scratch[id] {
162                    break;
163                }
164                scratch[id] = true;
165                sum += self.branch_length[id];
166                cur = self.parent[id];
167            }
168        }
169        sum
170    }
171}
172
173pub fn run<R: BufRead, W: Write>(reader: R, out: &mut W, tree: &Tree, cfg: &Config) -> Result<()> {
174    let table = CountTable::parse(reader, cfg.delim)?;
175    let pd = PdTree::build(tree)?;
176
177    let row_tip: Vec<NodeId> = table
178        .feature_ids
179        .iter()
180        .map(|taxon| {
181            pd.tip_index.get(taxon).copied().ok_or_else(|| {
182                RsomicsError::InvalidInput(format!(
183                    "taxon '{taxon}' from the count table is not a tip in the tree"
184                ))
185            })
186        })
187        .collect::<Result<_>>()?;
188
189    writeln!(out, "sample\tfaith_pd").map_err(RsomicsError::Io)?;
190    let mut observed = Vec::new();
191    let mut scratch = Vec::new();
192    for (col, sample) in table.sample_names.iter().enumerate() {
193        observed.clear();
194        for (row, &c) in table.columns[col].iter().enumerate() {
195            if c > 0 {
196                observed.push(row_tip[row]);
197            }
198        }
199        let value = pd.faith_pd(&observed, &mut scratch);
200        writeln!(out, "{sample}\t{value:.*}", cfg.precision).map_err(RsomicsError::Io)?;
201    }
202    Ok(())
203}
204
205#[cfg(test)]
206mod tests {
207    use super::*;
208
209    fn doc_tree() -> Tree {
210        Tree::from_newick(
211            "(((((U1:0.5,U2:0.5):0.5,U3:1.0):1.0):0.0,(U4:0.75,(U5:0.5,((U6:0.33,U7:0.62):0.5,U8:0.5):0.5):0.5):1.25):0.0)root;",
212        )
213        .unwrap()
214    }
215
216    fn pd_of(tree: &Tree, table: &str) -> Vec<(String, f64)> {
217        let cfg = Config {
218            delim: '\t',
219            precision: 6,
220        };
221        let mut out = Vec::new();
222        run(std::io::Cursor::new(table), &mut out, tree, &cfg).unwrap();
223        String::from_utf8(out)
224            .unwrap()
225            .lines()
226            .skip(1)
227            .map(|l| {
228                let (s, v) = l.split_once('\t').unwrap();
229                (s.to_string(), v.parse().unwrap())
230            })
231            .collect()
232    }
233
234    #[test]
235    fn skbio_doc_example() {
236        let table = "feature\tu\nU1\t1\nU2\t0\nU3\t0\nU4\t4\nU5\t1\nU6\t2\nU7\t3\nU8\t0\n";
237        assert!((pd_of(&doc_tree(), table)[0].1 - 6.95).abs() < 1e-9);
238    }
239
240    #[test]
241    fn single_taxon_reaches_root() {
242        let table = "feature\ts\nU1\t1\nU2\t0\nU3\t0\nU4\t0\nU5\t0\nU6\t0\nU7\t0\nU8\t0\n";
243        assert!((pd_of(&doc_tree(), table)[0].1 - 2.0).abs() < 1e-9);
244    }
245
246    #[test]
247    fn empty_sample_is_zero() {
248        let table = "feature\tz\nU1\t0\nU2\t0\nU3\t0\nU4\t0\nU5\t0\nU6\t0\nU7\t0\nU8\t0\n";
249        assert_eq!(pd_of(&doc_tree(), table)[0].1, 0.0);
250    }
251
252    #[test]
253    fn taxa_subset_of_tree_tips() {
254        let tree = Tree::from_newick("((A:1.0,B:2.0):0.5,(C:3.0,D:4.0):1.5)root;").unwrap();
255        let table = "feature\tx\nA\t1\nC\t1\n";
256        assert!((pd_of(&tree, table)[0].1 - 6.0).abs() < 1e-9);
257    }
258
259    #[test]
260    fn trifurcating_root_rejected() {
261        let tree = Tree::from_newick("(A:1.0,B:2.0,C:3.0)root;").unwrap();
262        let cfg = Config {
263            delim: '\t',
264            precision: 6,
265        };
266        let mut out = Vec::new();
267        let table = "feature\tx\nA\t1\nB\t1\nC\t1\n";
268        assert!(run(std::io::Cursor::new(table), &mut out, &tree, &cfg).is_err());
269    }
270
271    #[test]
272    fn unknown_taxon_rejected() {
273        let tree = Tree::from_newick("((A:1.0,B:2.0):0.5,C:3.0)root;").unwrap();
274        let cfg = Config {
275            delim: '\t',
276            precision: 6,
277        };
278        let mut out = Vec::new();
279        let table = "feature\tx\nA\t1\nZ\t1\n";
280        assert!(run(std::io::Cursor::new(table), &mut out, &tree, &cfg).is_err());
281    }
282}