1use std::collections::HashMap;
2use std::io::{BufRead, Write};
3
4use rsomics_common::{Result, RsomicsError};
5use rsomics_phylo_tree::{NodeId, Tree};
6
7pub struct CountTable {
13 pub feature_ids: Vec<String>,
14 pub sample_names: Vec<String>,
15 pub columns: Vec<Vec<u64>>,
17}
18
19impl CountTable {
20 pub fn parse<R: BufRead>(reader: R, delim: char) -> Result<CountTable> {
23 let mut lines = reader.lines();
24 let header = loop {
25 match lines.next() {
26 Some(line) => {
27 let line = line.map_err(RsomicsError::Io)?;
28 if line.trim().is_empty() || line.starts_with('#') {
29 continue;
30 }
31 break line;
32 }
33 None => return Err(RsomicsError::InvalidInput("empty count table".into())),
34 }
35 };
36 let sample_names: Vec<String> = header
37 .split(delim)
38 .skip(1)
39 .map(|s| s.trim().to_string())
40 .collect();
41 if sample_names.is_empty() {
42 return Err(RsomicsError::InvalidInput(
43 "header has no sample columns (need feature-ID column + ≥1 sample)".into(),
44 ));
45 }
46 let n = sample_names.len();
47 let mut feature_ids = Vec::new();
48 let mut columns: Vec<Vec<u64>> = vec![Vec::new(); n];
49 for (row_idx, line) in lines.enumerate() {
50 let line = line.map_err(RsomicsError::Io)?;
51 if line.trim().is_empty() || line.starts_with('#') {
52 continue;
53 }
54 let mut fields = line.split(delim);
55 let feature = fields.next().unwrap_or("").trim().to_string();
56 let mut seen = 0usize;
57 for (col, field) in fields.enumerate() {
58 if col >= n {
59 return Err(RsomicsError::InvalidInput(format!(
60 "row {} (feature '{feature}') has more columns than the header",
61 row_idx + 2
62 )));
63 }
64 let count: u64 = field.trim().parse().map_err(|_| {
65 RsomicsError::InvalidInput(format!(
66 "row {} (feature '{feature}'), sample '{}': '{}' is not a non-negative integer count",
67 row_idx + 2,
68 sample_names[col],
69 field.trim()
70 ))
71 })?;
72 columns[col].push(count);
73 seen += 1;
74 }
75 if seen != n {
76 return Err(RsomicsError::InvalidInput(format!(
77 "row {} (feature '{feature}') has {seen} count columns, header has {n}",
78 row_idx + 2
79 )));
80 }
81 feature_ids.push(feature);
82 }
83 Ok(CountTable {
84 feature_ids,
85 sample_names,
86 columns,
87 })
88 }
89}
90
91pub struct Config {
92 pub delim: char,
93 pub precision: usize,
94}
95
96struct PdTree {
99 branch_length: Vec<f64>,
100 parent: Vec<Option<NodeId>>,
101 tip_index: HashMap<String, NodeId>,
102 n_nodes: usize,
103}
104
105impl PdTree {
106 fn build(tree: &Tree) -> Result<PdTree> {
107 let n_nodes = tree.nodes.len();
108 if tree.nodes[tree.root].children.len() > 2 {
109 return Err(RsomicsError::InvalidInput(
110 "tree is not rooted (root has more than two children)".into(),
111 ));
112 }
113
114 let mut branch_length = vec![0.0f64; n_nodes];
115 let mut parent = vec![None; n_nodes];
116 let mut tip_index = HashMap::new();
117 for node in &tree.nodes {
118 parent[node.id] = node.parent;
119 match node.branch_length {
120 Some(bl) => branch_length[node.id] = bl,
121 None if node.id == tree.root => {}
122 None => {
123 return Err(RsomicsError::InvalidInput(
124 "every non-root node must have a branch length".into(),
125 ));
126 }
127 }
128 if node.children.is_empty() {
129 let name = node
130 .name
131 .as_deref()
132 .ok_or_else(|| RsomicsError::InvalidInput("a tip has no name".into()))?;
133 if tip_index.insert(name.to_string(), node.id).is_some() {
134 return Err(RsomicsError::InvalidInput(format!(
135 "duplicate tip name '{name}' in the tree"
136 )));
137 }
138 }
139 }
140
141 Ok(PdTree {
142 branch_length,
143 parent,
144 tip_index,
145 n_nodes,
146 })
147 }
148
149 fn faith_pd(&self, observed_tips: &[NodeId], scratch: &mut Vec<bool>) -> f64 {
155 scratch.clear();
156 scratch.resize(self.n_nodes, false);
157 let mut sum = 0.0;
158 for &tip in observed_tips {
159 let mut cur = Some(tip);
160 while let Some(id) = cur {
161 if scratch[id] {
162 break;
163 }
164 scratch[id] = true;
165 sum += self.branch_length[id];
166 cur = self.parent[id];
167 }
168 }
169 sum
170 }
171}
172
173pub fn run<R: BufRead, W: Write>(reader: R, out: &mut W, tree: &Tree, cfg: &Config) -> Result<()> {
174 let table = CountTable::parse(reader, cfg.delim)?;
175 let pd = PdTree::build(tree)?;
176
177 let row_tip: Vec<NodeId> = table
178 .feature_ids
179 .iter()
180 .map(|taxon| {
181 pd.tip_index.get(taxon).copied().ok_or_else(|| {
182 RsomicsError::InvalidInput(format!(
183 "taxon '{taxon}' from the count table is not a tip in the tree"
184 ))
185 })
186 })
187 .collect::<Result<_>>()?;
188
189 writeln!(out, "sample\tfaith_pd").map_err(RsomicsError::Io)?;
190 let mut observed = Vec::new();
191 let mut scratch = Vec::new();
192 for (col, sample) in table.sample_names.iter().enumerate() {
193 observed.clear();
194 for (row, &c) in table.columns[col].iter().enumerate() {
195 if c > 0 {
196 observed.push(row_tip[row]);
197 }
198 }
199 let value = pd.faith_pd(&observed, &mut scratch);
200 writeln!(out, "{sample}\t{value:.*}", cfg.precision).map_err(RsomicsError::Io)?;
201 }
202 Ok(())
203}
204
205#[cfg(test)]
206mod tests {
207 use super::*;
208
209 fn doc_tree() -> Tree {
210 Tree::from_newick(
211 "(((((U1:0.5,U2:0.5):0.5,U3:1.0):1.0):0.0,(U4:0.75,(U5:0.5,((U6:0.33,U7:0.62):0.5,U8:0.5):0.5):0.5):1.25):0.0)root;",
212 )
213 .unwrap()
214 }
215
216 fn pd_of(tree: &Tree, table: &str) -> Vec<(String, f64)> {
217 let cfg = Config {
218 delim: '\t',
219 precision: 6,
220 };
221 let mut out = Vec::new();
222 run(std::io::Cursor::new(table), &mut out, tree, &cfg).unwrap();
223 String::from_utf8(out)
224 .unwrap()
225 .lines()
226 .skip(1)
227 .map(|l| {
228 let (s, v) = l.split_once('\t').unwrap();
229 (s.to_string(), v.parse().unwrap())
230 })
231 .collect()
232 }
233
234 #[test]
235 fn skbio_doc_example() {
236 let table = "feature\tu\nU1\t1\nU2\t0\nU3\t0\nU4\t4\nU5\t1\nU6\t2\nU7\t3\nU8\t0\n";
237 assert!((pd_of(&doc_tree(), table)[0].1 - 6.95).abs() < 1e-9);
238 }
239
240 #[test]
241 fn single_taxon_reaches_root() {
242 let table = "feature\ts\nU1\t1\nU2\t0\nU3\t0\nU4\t0\nU5\t0\nU6\t0\nU7\t0\nU8\t0\n";
243 assert!((pd_of(&doc_tree(), table)[0].1 - 2.0).abs() < 1e-9);
244 }
245
246 #[test]
247 fn empty_sample_is_zero() {
248 let table = "feature\tz\nU1\t0\nU2\t0\nU3\t0\nU4\t0\nU5\t0\nU6\t0\nU7\t0\nU8\t0\n";
249 assert_eq!(pd_of(&doc_tree(), table)[0].1, 0.0);
250 }
251
252 #[test]
253 fn taxa_subset_of_tree_tips() {
254 let tree = Tree::from_newick("((A:1.0,B:2.0):0.5,(C:3.0,D:4.0):1.5)root;").unwrap();
255 let table = "feature\tx\nA\t1\nC\t1\n";
256 assert!((pd_of(&tree, table)[0].1 - 6.0).abs() < 1e-9);
257 }
258
259 #[test]
260 fn trifurcating_root_rejected() {
261 let tree = Tree::from_newick("(A:1.0,B:2.0,C:3.0)root;").unwrap();
262 let cfg = Config {
263 delim: '\t',
264 precision: 6,
265 };
266 let mut out = Vec::new();
267 let table = "feature\tx\nA\t1\nB\t1\nC\t1\n";
268 assert!(run(std::io::Cursor::new(table), &mut out, &tree, &cfg).is_err());
269 }
270
271 #[test]
272 fn unknown_taxon_rejected() {
273 let tree = Tree::from_newick("((A:1.0,B:2.0):0.5,C:3.0)root;").unwrap();
274 let cfg = Config {
275 delim: '\t',
276 precision: 6,
277 };
278 let mut out = Vec::new();
279 let table = "feature\tx\nA\t1\nZ\t1\n";
280 assert!(run(std::io::Cursor::new(table), &mut out, &tree, &cfg).is_err());
281 }
282}