alevin_fry/pugutils.rs
1/*
2 * Copyright (c) 2020-2024 COMBINE-lab.
3 *
4 * This file is part of alevin-fry
5 * (see https://www.github.com/COMBINE-lab/alevin-fry).
6 *
7 * License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
8 */
9
10#[allow(unused_imports)]
11use ahash::{AHasher, RandomState};
12use arrayvec::ArrayVec;
13use smallvec::SmallVec;
14use std::cmp::Ordering;
15use std::collections::{HashMap, HashSet, VecDeque};
16use std::io;
17use std::io::Write;
18
19use petgraph::prelude::*;
20use petgraph::unionfind::*;
21use petgraph::visit::NodeIndexable;
22
23use libradicl::chunk;
24use libradicl::record::AlevinFryReadRecord;
25
26use slog::{crit, info, warn};
27
28use crate::eq_class::{EqMap, EqMapType};
29use crate::quant::SplicedAmbiguityModel;
30use crate::utils as afutils;
31
32type CcMap = HashMap<u32, Vec<u32>, ahash::RandomState>;
33
34#[derive(Debug)]
35pub enum PugEdgeType {
36 NoEdge,
37 BiDirected,
38 XToY,
39 YToX,
40}
41
42#[derive(Debug)]
43pub struct PugResolutionStatistics {
44 pub used_alternative_strategy: bool,
45 pub total_mccs: u64,
46 pub ambiguous_mccs: u64,
47 pub trivial_mccs: u64,
48}
49
50/// Extracts the parsimonious UMI graphs (PUGs) from the
51/// equivalence class map for a given cell.
52/// The returned graph is a directed graph (potentially with
53/// bidirected edges) where each node consists of an (equivalence
54/// class, UMI ID) pair. Note, crucially, that the UMI ID is simply
55/// the rank of the UMI in the list of all distinct UMIs for this
56/// equivalence class. There is a directed edge between any pair of
57/// vertices whose set of transcripts overlap and whose UMIs are within
58/// a Hamming distance of 1 of each other. If one node has more than
59/// twice the frequency of the other, the edge is directed from the
60/// more frequent to the less freuqent node. Otherwise, edges are
61/// added in both directions.
62pub fn extract_graph(
63 eqmap: &EqMap,
64 pug_exact_umi: bool, // true if only identical UMIs induce an edge
65 log: &slog::Logger,
66) -> petgraph::graphmap::GraphMap<(u32, u32), (), petgraph::Directed> {
67 let verbose = false;
68 let mut one_edit = 0u64;
69 let mut zero_edit = 0u64;
70
71 // given 2 pairs (UMI, count), determine if an edge exists
72 // between them, and if so, what type.
73 let mut has_edge = |x: &(u64, u32), y: &(u64, u32)| -> PugEdgeType {
74 let hdist = if pug_exact_umi {
75 if x.0 == y.0 {
76 0
77 } else {
78 usize::MAX
79 }
80 } else {
81 afutils::count_diff_2_bit_packed(x.0, y.0)
82 };
83
84 if hdist == 0 {
85 zero_edit += 1;
86 return PugEdgeType::BiDirected;
87 }
88
89 if hdist < 2 {
90 one_edit += 1;
91 return if x.1 > (2 * y.1 - 1) {
92 PugEdgeType::XToY
93 } else if y.1 > (2 * x.1 - 1) {
94 PugEdgeType::YToX
95 } else {
96 PugEdgeType::BiDirected
97 };
98 }
99 PugEdgeType::NoEdge
100 };
101
102 let mut _bidirected = 0u64;
103 let mut _unidirected = 0u64;
104
105 let mut graph = DiGraphMap::<(u32, u32), ()>::new();
106 let mut hset = vec![0u8; eqmap.num_eq_classes()];
107 let mut idxvec: SmallVec<[u32; 128]> = SmallVec::new();
108
109 // insert all of the nodes up front to avoid redundant
110 // checks later.
111 for eqid in 0..eqmap.num_eq_classes() {
112 // get the info Vec<(UMI, frequency)>
113 let eq = &eqmap.eqc_info[eqid];
114 let u1 = &eq.umis;
115 for (xi, _x) in u1.iter().enumerate() {
116 graph.add_node((eqid as u32, xi as u32));
117 }
118 }
119
120 // for every equivalence class in this cell
121 for eqid in 0..eqmap.num_eq_classes() {
122 if verbose && eqid % 1000 == 0 {
123 print!("\rprocessed {:?} eq classes", eqid);
124 io::stdout().flush().expect("Could not flush stdout");
125 }
126
127 // get the info Vec<(UMI, frequency)>
128 let eq = &eqmap.eqc_info[eqid];
129
130 // for each (umi, count) pair and its index
131 let u1 = &eq.umis;
132 for (xi, x) in u1.iter().enumerate() {
133 // add a node
134 // graph.add_node((eqid as u32, xi as u32));
135
136 // for each (umi, freq) pair and node after this one
137 for (xi2, x2) in u1.iter().enumerate().skip(xi + 1) {
138 //for xi2 in (xi + 1)..u1.len() {
139 // x2 is the other (umi, freq) pair
140 //let x2 = &u1[xi2];
141
142 // add a node for it
143 // graph.add_node((eqid as u32, xi2 as u32));
144
145 // determine if an edge exists between x and x2, and if so, what kind
146 let et = has_edge(x, x2);
147 // for each type of edge, add the appropriate edge in the graph
148 match et {
149 PugEdgeType::BiDirected => {
150 graph.add_edge((eqid as u32, xi as u32), (eqid as u32, xi2 as u32), ());
151 graph.add_edge((eqid as u32, xi2 as u32), (eqid as u32, xi as u32), ());
152 _bidirected += 1;
153 //if multi_gene_vec[eqid] == true {
154 // bidirected_in_multigene += 1;
155 //}
156 }
157 PugEdgeType::XToY => {
158 graph.add_edge((eqid as u32, xi as u32), (eqid as u32, xi2 as u32), ());
159 _unidirected += 1;
160 //if multi_gene_vec[eqid] == true {
161 // unidirected_in_multigene += 1;
162 //}
163 }
164 PugEdgeType::YToX => {
165 graph.add_edge((eqid as u32, xi2 as u32), (eqid as u32, xi as u32), ());
166 _unidirected += 1;
167 //if multi_gene_vec[eqid] == true {
168 // unidirected_in_multigene += 1;
169 //}
170 }
171 PugEdgeType::NoEdge => {}
172 }
173 }
174 }
175
176 //hset.clear();
177 //hset.resize(eqmap.num_eq_classes(), 0u8);
178 for i in &idxvec {
179 hset[*i as usize] = 0u8;
180 }
181 let stf = idxvec.len() > 128;
182 idxvec.clear();
183 if stf {
184 idxvec.shrink_to_fit();
185 }
186
187 // for every reference id in this eq class
188 for r in eqmap.refs_for_eqc(eqid as u32) {
189 // find the equivalence classes sharing this reference
190 for eq2id in eqmap.eq_classes_containing(*r).iter() {
191 // if eq2id <= eqid, then we already observed the relevant edges
192 // when we process eq2id
193 if (*eq2id as usize) <= eqid {
194 continue;
195 }
196 // otherwise, if we have already processed this other equivalence
197 // class because it shares _another_ reference (apart from r) with
198 // the current equivalence class, then skip it.
199 if hset[*eq2id as usize] > 0 {
200 continue;
201 }
202
203 // recall that we processed this eq class as a neighbor of eqid
204 hset[*eq2id as usize] = 1;
205 idxvec.push(*eq2id);
206 let eq2 = &eqmap.eqc_info[*eq2id as usize];
207
208 // compare all the umis between eqid and eq2id
209 let u2 = &eq2.umis;
210 for (xi, x) in u1.iter().enumerate() {
211 // Node for equiv : eqid and umi : xi
212 // graph.add_node((eqid as u32, xi as u32));
213
214 for (yi, y) in u2.iter().enumerate() {
215 // Node for equiv : eq2id and umi : yi
216 // graph.add_node((*eq2id as u32, yi as u32));
217
218 let et = has_edge(x, y);
219 match et {
220 PugEdgeType::BiDirected => {
221 graph.add_edge((eqid as u32, xi as u32), (*eq2id, yi as u32), ());
222 graph.add_edge((*eq2id, yi as u32), (eqid as u32, xi as u32), ());
223 _bidirected += 1;
224 //if multi_gene_vec[eqid] == true
225 // || multi_gene_vec[*eq2id as usize] == true
226 //{
227 // bidirected_in_multigene += 1;
228 //}
229 }
230 PugEdgeType::XToY => {
231 graph.add_edge((eqid as u32, xi as u32), (*eq2id, yi as u32), ());
232 _unidirected += 1;
233 //if multi_gene_vec[eqid] == true
234 // || multi_gene_vec[*eq2id as usize] == true
235 //{
236 // unidirected_in_multigene += 1;
237 //}
238 }
239 PugEdgeType::YToX => {
240 graph.add_edge((*eq2id, yi as u32), (eqid as u32, xi as u32), ());
241 _unidirected += 1;
242 //if multi_gene_vec[eqid] == true
243 // || multi_gene_vec[*eq2id as usize] == true
244 //{
245 // unidirected_in_multigene += 1;
246 //}
247 }
248 PugEdgeType::NoEdge => {}
249 }
250 }
251 }
252 }
253 }
254 }
255
256 if verbose {
257 info!(
258 log,
259 "\n\nsize of graph ({:?}, {:?})\n\n",
260 graph.node_count(),
261 graph.edge_count()
262 );
263 let total_edits = (one_edit + zero_edit) as f64;
264 info!(log, "\n\n\n{}\n\n\n", one_edit as f64 / total_edits);
265 }
266
267 graph
268}
269
270/// Extract the weakly connected components from the directed graph
271/// G. Interestingly, `petgraph` has a builtin algorithm for returning
272/// the strongly-connected components of a digraph, and they have an
273/// algorithm for returning the _number_ of connected components of an
274/// undirected graph, but no algorithm for returning the actual
275/// connected components. So, we build our own using their union
276/// find data structure. This returns a HashMap, mapping each
277/// connected component id (a u32) to the corresponding list of vertex
278/// ids (also u32s) contained in the connected component.
279pub fn weakly_connected_components<G>(g: G) -> CcMap
280where
281 G: petgraph::visit::NodeCompactIndexable + petgraph::visit::IntoEdgeReferences,
282{
283 let mut vertex_sets = UnionFind::new(g.node_bound());
284 for edge in g.edge_references() {
285 let (a, b) = (edge.source(), edge.target());
286
287 // union the two vertices of the edge
288 vertex_sets.union(g.to_index(a), g.to_index(b));
289 }
290 let labels = vertex_sets.into_labeling();
291 fn get_map() -> CcMap {
292 let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
293 HashMap::with_hasher(s)
294 }
295
296 let mut components = get_map();
297 for (i, v) in labels.iter().enumerate() {
298 let ve = components.entry(*v as u32).or_default();
299 ve.push(i as u32);
300 }
301 components
302}
303
304/// Find the largest monochromatic spanning arboresence
305/// in the graph `g` starting at vertex `v`. The arboresence
306/// is monochromatic if every vertex can be "covered" by a single
307/// transcript (i.e. there exists a transcript that appears in the
308/// equivalence class labels of all vertices in the arboresence).
309fn collapse_vertices(
310 v: u32,
311 uncovered_vertices: &HashSet<u32, ahash::RandomState>, // the set of vertices already covered
312 g: &petgraph::graphmap::GraphMap<(u32, u32), (), petgraph::Directed>,
313 eqmap: &EqMap,
314 hasher_state: &ahash::RandomState,
315) -> (Vec<u32>, u32) {
316 // get a new set to hold vertices
317 type VertexSet = HashSet<u32, ahash::RandomState>;
318 let get_set =
319 |cap: u32| VertexSet::with_capacity_and_hasher(cap as usize, hasher_state.clone());
320
321 // will hold the nodes in the largest arboresence found
322 let mut largest_mcc: Vec<u32> = Vec::new();
323 let mut chosen_txp = 0u32;
324 let vert = g.from_index(v as usize);
325
326 //unsafe {
327
328 let nvert = g.node_count();
329
330 // for every transcript in the equivalence class
331 // label of the vertex
332 for txp in eqmap.refs_for_eqc(vert.0).iter() {
333 // start a bfs from this vertex
334 let mut bfs_list = VecDeque::new();
335 bfs_list.push_back(v);
336
337 // the set to remember the nodes we've already
338 // visited
339 let mut visited_set = get_set(nvert as u32);
340 visited_set.insert(v);
341
342 // will hold the current arboresence we
343 // are constructing
344 let mut current_mcc = Vec::new();
345
346 // get the next vertex in the BFS
347 while let Some(cv) = bfs_list.pop_front() {
348 // add it to the arboresence
349 current_mcc.push(cv);
350
351 // for all of the neighboring vertices that we can
352 // reach (those with outgoing, or bidirected edges)
353 for nv in g.neighbors_directed(g.from_index(cv as usize), Outgoing) {
354 let n = g.to_index(nv) as u32;
355
356 // check if we should add this vertex or not:
357 // uncovered_vertices contains the the current set of
358 // *uncovered* vertices in this component (i.e. those
359 // that we still need to explain by some molecule).
360 // so, if n is *not* in uncovered_vertices, then it is not in the
361 // uncovered set, and so it has already been
362 // explained / covered.
363 //
364 // if n hasn't been covered yet, then
365 // check if we've seen n in this traversal
366 // yet. The `insert()` method returns true
367 // if the set didn't have the element, false
368 // otherwise.
369 if !uncovered_vertices.contains(&n) || !visited_set.insert(n) {
370 continue;
371 }
372
373 // get the set of transcripts present in the
374 // label of the current node.
375 let n_labels = eqmap.refs_for_eqc(nv.0);
376 if let Ok(_n) = n_labels.binary_search(txp) {
377 bfs_list.push_back(n);
378 }
379 }
380 }
381
382 // if this arboresence is the largest we've yet
383 // seen, then record it
384 if largest_mcc.len() < current_mcc.len() {
385 largest_mcc = current_mcc;
386 chosen_txp = *txp;
387 }
388 }
389 //}// unsafe
390
391 (largest_mcc, chosen_txp)
392}
393
394#[inline]
395fn resolve_num_molecules_crlike_from_vec_prefer_ambig(
396 umi_gene_count_vec: &mut [(u64, u32, u32)],
397 gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
398) {
399 // sort the triplets
400 // first on umi
401 // then on gene_id
402 // then on count
403 umi_gene_count_vec.sort_unstable();
404
405 // hold the current umi and gene we are examining
406 let mut curr_umi = umi_gene_count_vec.first().expect("cell with no UMIs").0;
407 let first_gn = umi_gene_count_vec.first().expect("cell with no UMIs").1;
408 // The capacity of curr_gn is 2 as it will be used to hold the
409 // the spliced id of a gene, the unspliced id of a gene, or both
410 let mut curr_gn = ArrayVec::<u32, 2>::new();
411 curr_gn.push(first_gn);
412
413 // hold the gene id having the max count for this umi
414 // and the maximum count value itself
415 let mut max_count = 0u32;
416 // to aggregate the count should a (umi, gene) pair appear
417 // more than once
418 let mut count_aggr = 0u32;
419 // the vector will hold the equivalent set of best genes
420 let mut best_genes = Vec::<u32>::with_capacity(16);
421
422 // look over all sorted triplets
423 for (cidx, &(umi, gn, ct)) in umi_gene_count_vec.iter().enumerate() {
424 // if this umi is different than
425 // the one we are processing
426 // then decide what action to take
427 // on the previous umi
428 if umi != curr_umi {
429 // update the count of the equivalence class of genes
430 // that gets this UMI
431
432 *gene_eqclass_hash.entry(best_genes.clone()).or_insert(0) += 1;
433
434 // the next umi and gene
435 curr_umi = umi;
436 curr_gn.clear();
437 curr_gn.push(gn);
438
439 // current gene is current best
440 best_genes.clear();
441 best_genes.push(gn);
442
443 // count aggr = max count = ct
444 count_aggr = ct;
445 max_count = ct;
446 } else {
447 // the umi was the same
448
449 let prev_gid = *curr_gn.last().expect("not empty");
450
451 // if the gene is the same (modulo splicing), add the counts
452 if afutils::same_gene(gn, prev_gid, true) {
453 // if the gene is the same modulo splicing, but
454 // curr_gn != gn, then this is gene will be set as ambiguous,
455 // this is the transition from counting occurrences
456 // of this UMI from the spliced to unspliced version
457 // of the gene.
458 if prev_gid != gn {
459 // mark the current gene as
460 // splicing ambiguous by pushing
461 // the unspliced id onto curr_gn
462 curr_gn.push(gn);
463 }
464 count_aggr += ct;
465 } else {
466 // if the gene is different, then restart the count_aggr
467 // and set the current gene id
468 count_aggr = ct;
469 curr_gn.clear();
470 curr_gn.push(gn);
471 }
472
473 // we have the following cases, consider we are
474 // processing the records for gene g_i. If
475 // * curr_gn = [g_i^s], then we have so far only observed spliced reads for g_i
476 // * curr_gn = [g_i^u], then we have only observed unspliced reads for g_i
477 // (and there are no spliced reads)
478 // * curr_gn = [g_i^s, g_i^u], then we have observed both spliced and unspliced
479 // reads for g_i, and this gene will be considered splicing ambiguous.
480
481 // the current best_genes is either empty or contains some
482 // set of genes g_i-k, ..., g_i-1, and now,
483 // g_i matches their count. If g_i has only observed spliced reads,
484 // then we will add g_i^s to the list. If g_i has observed
485 // only unspliced reads then we will add g_i^u, otherwise
486 // we will add both g_i^s and g_i^u.
487
488 // the current best_genes is either emtpy or contains some
489 // set of genes g_i-k, ..., g_i-1, and now,
490 // g_i *exceeds* their count. If g_i has only observed spliced reads,
491 // then we will *clear* best_genes, and replace it with g_i^s.
492 // If g_i has only observed unspliced reads, then we will *clear*
493 // best_genes and replace it with g_i^u. Othewise we will *clear*
494 // best_genes and replace it with [g_i^s, g_i^u];
495
496 // if the count aggregator exceeded the max
497 // then it is the new max, and this gene is
498 // the new max gene. Having a distinct max
499 // also makes this UMI uniquely resolvable
500 match count_aggr.cmp(&max_count) {
501 Ordering::Greater => {
502 max_count = count_aggr;
503 best_genes.clear();
504 best_genes.extend(curr_gn.iter());
505 }
506 Ordering::Equal => {
507 // if we have a tie for the max count
508 // then the current UMI isn't uniquely-unresolvable
509 // it will stay this way unless we see a bigger
510 // count for this UMI. We add the current
511 // "tied" gene to the equivalence class.
512 best_genes.extend(curr_gn.iter());
513 }
514 Ordering::Less => {
515 // we do nothing
516 }
517 }
518 }
519
520 // if this was the last UMI in the list
521 if cidx == umi_gene_count_vec.len() - 1 {
522 *gene_eqclass_hash.entry(best_genes.clone()).or_insert(0) += 1;
523 }
524 }
525}
526
527#[inline]
528fn resolve_num_molecules_crlike_from_vec(
529 umi_gene_count_vec: &mut [(u64, u32, u32)],
530 gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
531) {
532 // sort the triplets
533 // first on umi
534 // then on gene_id
535 // then on count
536 umi_gene_count_vec.sort_unstable();
537
538 // hold the current umi and gene we are examining
539 let mut curr_umi = umi_gene_count_vec.first().expect("cell with no UMIs").0;
540 let mut curr_gn = umi_gene_count_vec.first().expect("cell with no UMIs").1;
541 // hold the gene id having the max count for this umi
542 // and the maximum count value itself
543 // let mut max_count_gene = 0u32;
544 let mut max_count = 0u32;
545 // to aggregate the count should a (umi, gene) pair appear
546 // more than once
547 let mut count_aggr = 0u32;
548 // the vector will hold the equivalent set of best genes
549 let mut best_genes = Vec::<u32>::with_capacity(16);
550
551 // look over all sorted triplets
552 for (cidx, &(umi, gn, ct)) in umi_gene_count_vec.iter().enumerate() {
553 // if this umi is different than
554 // the one we are processing
555 // then decide what action to take
556 // on the previous umi
557 if umi != curr_umi {
558 // update the count of the equivalence class of genes
559 // that gets this UMI
560 *gene_eqclass_hash.entry(best_genes.clone()).or_insert(0) += 1;
561
562 // the next umi and gene
563 curr_umi = umi;
564 curr_gn = gn;
565
566 // current gene is current best
567 // max_count_gene = gn;
568 best_genes.clear();
569 best_genes.push(gn);
570
571 // count aggr = max count = ct
572 count_aggr = ct;
573 max_count = ct;
574 } else {
575 // the umi was the same
576
577 // if the gene is the same, add the counts
578 if gn == curr_gn {
579 count_aggr += ct;
580 } else {
581 // if the gene is different, then restart the count_aggr
582 // and set the current gene id
583 count_aggr = ct;
584 curr_gn = gn;
585 }
586 // if the count aggregator exceeded the max
587 // then it is the new max, and this gene is
588 // the new max gene. Having a distinct max
589 // also makes this UMI uniquely resolvable
590 match count_aggr.cmp(&max_count) {
591 Ordering::Greater => {
592 max_count = count_aggr;
593 // we want to avoid the case that we are just
594 // updating the count of the best gene above and
595 // here we clear out the vector and populate it
596 // with the same element again and again. So
597 // if the current best_genes vector holds just
598 // gn, we do nothing. Otherwise we clear it and
599 // add gn.
600 match &best_genes[..] {
601 [x] if *x == gn => { /* do nothing here */ }
602 _ => {
603 best_genes.clear();
604 best_genes.push(gn);
605 }
606 }
607 }
608 Ordering::Equal => {
609 // if we have a tie for the max count
610 // then the current UMI isn't uniquely-unresolvable
611 // it will stay this way unless we see a bigger
612 // count for this UMI. We add the current
613 // "tied" gene to the equivalence class.
614 best_genes.push(gn);
615 }
616 Ordering::Less => {
617 // we do nothing
618 }
619 }
620 }
621
622 // if this was the last UMI in the list
623 if cidx == umi_gene_count_vec.len() - 1 {
624 *gene_eqclass_hash.entry(best_genes.clone()).or_insert(0) += 1;
625 }
626 }
627}
628
629pub fn get_num_molecules_cell_ranger_like_small(
630 cell_chunk: &mut chunk::Chunk<AlevinFryReadRecord>,
631 tid_to_gid: &[u32],
632 _num_genes: usize,
633 gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
634 sa_model: SplicedAmbiguityModel,
635 _log: &slog::Logger,
636) {
637 let mut umi_gene_count_vec: Vec<(u64, u32, u32)> = Vec::with_capacity(cell_chunk.nrec as usize);
638
639 // for each record
640 for rec in &cell_chunk.reads {
641 // get the umi
642 let umi = rec.umi;
643
644 // project the transcript ids to gene ids
645 let mut gset: Vec<u32> = rec
646 .refs
647 .iter()
648 .map(|tid| tid_to_gid[*tid as usize])
649 .collect();
650 // and make the gene ids unique
651 gset.sort_unstable();
652 gset.dedup();
653 for g in &gset {
654 umi_gene_count_vec.push((umi, *g, 1));
655 }
656 }
657 match sa_model {
658 SplicedAmbiguityModel::WinnerTakeAll => {
659 resolve_num_molecules_crlike_from_vec(&mut umi_gene_count_vec, gene_eqclass_hash);
660 }
661 SplicedAmbiguityModel::PreferAmbiguity => {
662 resolve_num_molecules_crlike_from_vec_prefer_ambig(
663 &mut umi_gene_count_vec,
664 gene_eqclass_hash,
665 );
666 }
667 }
668}
669
670pub fn get_num_molecules_cell_ranger_like(
671 eq_map: &EqMap,
672 tid_to_gid: &[u32],
673 _num_genes: usize,
674 gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
675 sa_model: SplicedAmbiguityModel,
676 _log: &slog::Logger,
677) {
678 // TODO: better capacity
679 let mut umi_gene_count_vec: Vec<(u64, u32, u32)> = vec![];
680
681 // for each equivalence class
682 for eqinfo in &eq_map.eqc_info {
683 // get the (umi, count) pairs
684 let umis = &eqinfo.umis;
685 let eqid = &eqinfo.eq_num;
686
687 // project the transcript ids to gene ids
688 let mut gset: Vec<u32> = eq_map
689 .refs_for_eqc(*eqid)
690 .iter()
691 .map(|tid| tid_to_gid[*tid as usize])
692 .collect();
693 // and make the gene ids unique,
694 // note, if we have both spliced and
695 // unspliced gene ids, then they will
696 // necessarily be adjacent here, since
697 // they are always asigned adjacent ids
698 // with spliced being even and unspliced odd.
699 gset.sort_unstable();
700 gset.dedup();
701
702 // add every (umi, count), gene pair as a triplet
703 // of (umi, gene_id, count) to the output vector
704 for umi_ct in umis {
705 for g in &gset {
706 umi_gene_count_vec.push((umi_ct.0, *g, umi_ct.1));
707 }
708 }
709 }
710 match sa_model {
711 SplicedAmbiguityModel::WinnerTakeAll => {
712 resolve_num_molecules_crlike_from_vec(&mut umi_gene_count_vec, gene_eqclass_hash);
713 }
714 SplicedAmbiguityModel::PreferAmbiguity => {
715 resolve_num_molecules_crlike_from_vec_prefer_ambig(
716 &mut umi_gene_count_vec,
717 gene_eqclass_hash,
718 );
719 }
720 }
721}
722
723pub fn get_num_molecules_trivial_discard_all_ambig(
724 eq_map: &EqMap,
725 tid_to_gid: &[u32],
726 num_genes: usize,
727 _log: &slog::Logger,
728) -> (Vec<f32>, f64) {
729 let mut counts = vec![0.0f32; num_genes];
730 let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
731 let mut gene_map: std::collections::HashMap<u32, Vec<u64>, ahash::RandomState> =
732 HashMap::with_hasher(s);
733
734 let mut total_umis = 0u64;
735 let mut multi_gene_umis = 0u64;
736
737 for eqinfo in &eq_map.eqc_info {
738 let umis = &eqinfo.umis;
739 let eqid = &eqinfo.eq_num;
740 let tset = eq_map.refs_for_eqc(*eqid);
741 let mut prev_gene_id = u32::MAX;
742 let mut multi_gene = false;
743 // if this is a single-gene equivalence class
744 // then go ahead and assign the read
745 for t in tset {
746 let gid = tid_to_gid[*t as usize];
747 if gid != prev_gene_id && prev_gene_id < u32::MAX {
748 multi_gene = true;
749 break;
750 }
751 prev_gene_id = gid;
752 }
753
754 total_umis += umis.len() as u64;
755 if multi_gene {
756 multi_gene_umis += umis.len() as u64;
757 }
758
759 // if the read is single-gene
760 // then add this equivalence class' list
761 // of UMIs in the gene map
762 if !multi_gene {
763 gene_map
764 .entry(prev_gene_id)
765 .or_default()
766 .extend(umis.iter().map(|x| x.0));
767 }
768 }
769
770 // go over the map and merge umis from different
771 // equivalence classes that still map to the same
772 // gene.
773 for (k, v) in gene_map.iter_mut() {
774 v.sort_unstable();
775 v.dedup();
776 // the count is the number of distinct UMIs.
777 counts[*k as usize] += v.len() as f32;
778 }
779
780 // return the counts
781 (counts, multi_gene_umis as f64 / total_umis as f64)
782}
783
784/// given the connected component (subgraph) of `g` defined by the
785/// vertices in `vertex_ids`, apply the cell-ranger-like algorithm
786/// within this subgraph.
787fn get_num_molecules_large_component(
788 g: &petgraph::graphmap::GraphMap<(u32, u32), (), petgraph::Directed>,
789 eq_map: &EqMap,
790 vertex_ids: &[u32],
791 tid_to_gid: &[u32],
792 hasher_state: &ahash::RandomState,
793 gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
794 _log: &slog::Logger,
795) {
796 let gene_level_eq_map = match eq_map.map_type {
797 EqMapType::GeneLevel => true,
798 EqMapType::TranscriptLevel => false,
799 };
800
801 // TODO: better capacity
802 let mut umi_gene_count_vec: Vec<(u64, u32, u32)> = vec![];
803
804 // build a temporary hashmap from each
805 // equivalence class id in the current subgraph
806 // to the set of (UMI, frequency) pairs contained
807 // in the subgraph
808 //let ts = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
809 let mut tmp_map =
810 HashMap::<u32, Vec<(u64, u32)>, ahash::RandomState>::with_hasher(hasher_state.clone());
811
812 // for each vertex id in the subgraph
813 for vertex_id in vertex_ids {
814 // get the corresponding vertex which is
815 // an (eq_id, UMI index) pair
816 let vert = g.from_index(*vertex_id as usize);
817 // add the corresponding (UMI, frequency) pair to the map
818 // for this eq_id
819 let umis = tmp_map.entry(vert.0).or_default();
820 umis.push(eq_map.eqc_info[vert.0 as usize].umis[vert.1 as usize]);
821 }
822
823 for (k, v) in tmp_map.iter() {
824 // get the (umi, count) pairs
825 let umis = v; //&eqinfo.umis;
826 let eqid = k; //&eqinfo.eq_num;
827 // project the transcript ids to gene ids
828 let mut gset: Vec<u32>;
829
830 if gene_level_eq_map {
831 gset = eq_map.refs_for_eqc(*eqid).to_vec();
832 } else {
833 gset = eq_map
834 .refs_for_eqc(*eqid)
835 .iter()
836 .map(|tid| tid_to_gid[*tid as usize])
837 .collect();
838 // and make the gene ids unique
839 gset.sort_unstable();
840 gset.dedup();
841 }
842
843 // add every (umi, count), gene pair as a triplet
844 // of (umi, gene_id, count) to the output vector
845 for umi_ct in umis {
846 for g in &gset {
847 umi_gene_count_vec.push((umi_ct.0, *g, umi_ct.1));
848 }
849 }
850 }
851
852 resolve_num_molecules_crlike_from_vec(&mut umi_gene_count_vec, gene_eqclass_hash);
853}
854
855/// Given the digraph `g` representing the PUGs within the current
856/// cell, the EqMap `eqmap` to decode all equivalence classes
857/// and the transcript-to-gene map `tid_to_gid`, apply the parsimonious
858/// umi resolution algorithm. Pass any relevant logging messages along to
859/// `log`.
860pub fn get_num_molecules(
861 g: &petgraph::graphmap::GraphMap<(u32, u32), (), petgraph::Directed>,
862 eqmap: &EqMap,
863 tid_to_gid: &[u32],
864 gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
865 hasher_state: &ahash::RandomState,
866 large_graph_thresh: usize,
867 log: &slog::Logger,
868) -> PugResolutionStatistics
869//,)
870{
871 type U32Set = HashSet<u32, ahash::RandomState>;
872 let get_set = |cap: u32| {
873 //let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
874 U32Set::with_capacity_and_hasher(cap as usize, hasher_state.clone())
875 };
876
877 let gene_level_eq_map = match eqmap.map_type {
878 EqMapType::GeneLevel => true,
879 EqMapType::TranscriptLevel => false,
880 };
881
882 let comps = weakly_connected_components(g);
883 // a vector of length 2 that records at index 0
884 // the number of single-node subgraphs that are
885 // transcript-unique and at index 1 the number of
886 // single-node subgraphs that have more than one
887 // associated transcript.
888 let mut one_vertex_components: Vec<usize> = vec![0, 0];
889
890 // Make gene-level eqclasses.
891 // This is a map of gene ids to the count of
892 // _de-duplicated_ reads observed for that set of genes.
893 // For every gene set (label) of length 1, these are gene
894 // unique reads. Standard scRNA-seq counting results
895 // can be obtained by simply discarding all equivalence
896 // classes of size greater than 1, and probabilistic results
897 // will attempt to resolve gene multi-mapping reads by
898 // running and EM algorithm.
899 //let s = fasthash::RandomState::<Hash64>::new();
900 //let mut gene_eqclass_hash: HashMap<Vec<u32>, u32, fasthash::RandomState<Hash64>> =
901 // HashMap::with_hasher(s);
902
903 // Get the genes that could potentially explain all
904 // of the vertices in this mcc.
905 // To do this, we first extract the set of _transcripts_
906 // that label all vertices of the mcc, and then we project
907 // the transcripts to their corresponding gene ids.
908 //let mut global_txps : Vec<u32>;
909 let mut global_txps = get_set(16);
910 let mut pug_stats = PugResolutionStatistics {
911 used_alternative_strategy: false,
912 total_mccs: 0u64,
913 ambiguous_mccs: 0u64,
914 trivial_mccs: 0u64,
915 };
916
917 for (_comp_label, comp_verts) in comps.iter() {
918 if comp_verts.len() > 1 {
919 // the current parsimony resolution algorithm
920 // can become slow for connected components that
921 // are very large. For components with > large_graph_thresh
922 // vertices (this should be _very_ rare) we will instead
923 // resolve the UMIs in the component using a simpler algorithm.
924 if comp_verts.len() > large_graph_thresh {
925 get_num_molecules_large_component(
926 g,
927 eqmap,
928 comp_verts,
929 tid_to_gid,
930 hasher_state,
931 gene_eqclass_hash,
932 log,
933 );
934 warn!(
935 log,
936 "found connected component with {} vertices; resolved with cr-like resolution.",
937 comp_verts.len(),
938 );
939 pug_stats.used_alternative_strategy = true;
940 continue;
941 }
942
943 // uncovered_vertices will hold the set of vertices that are
944 // *not yet* covered.
945 //
946 // Non-deterministic variant :
947 // NOTE: The line below places the vertices into a HashSet that uses
948 // a hasher with a RandomState which, by default, Rust will randomize between
949 // runs. That means that the output of the entire algorithm will, in general,
950 // not be deterministic. By using a RandomState with fixed seeds, this can
951 // be made deterministic (see below), but it is unclear if this will increase
952 // bias of resolving components in favor of certain transcripts (and therefore genes)
953 // that tend to appear first in hash iteration order.
954 // let mut uncovered_vertices = comp_verts.iter().cloned().collect::<HashSet<u32, ahash::RandomState>>();
955
956 // Deterministic variant : replacing the above line with these two lines will
957 // cause the parsimony resolution to be deterministic, but potentially at the
958 // cost of increasing bias.
959 let mut uncovered_vertices = get_set(comp_verts.len() as u32);
960 for v in comp_verts.iter().cloned() {
961 uncovered_vertices.insert(v);
962 }
963
964 // we will remove covered vertices from uncovered_vertices until they are
965 // all gone (until all vertices have been covered)
966 while !uncovered_vertices.is_empty() {
967 let num_remaining = uncovered_vertices.len();
968 // will hold vertices in the best mcc
969 let mut best_mcc: Vec<u32> = Vec::new();
970 // the transcript that is responsible for the
971 // best mcc covering
972 let mut best_covering_txp = u32::MAX;
973 // for each vertex in the vertex set
974 for v in uncovered_vertices.iter() {
975 // find the largest mcc starting from this vertex
976 // and the transcript that covers it
977 // NOTE: what if there are multiple different mccs that
978 // are equally good? (@k3yavi — I don't think this case
979 // is even handled in the C++ code either).
980 let (new_mcc, covering_txp) =
981 collapse_vertices(*v, &uncovered_vertices, g, eqmap, hasher_state);
982
983 let mcc_len = new_mcc.len();
984 // if the new mcc is better than the current best, then
985 // it becomes the new best
986 if best_mcc.len() < mcc_len {
987 best_mcc = new_mcc;
988 best_covering_txp = covering_txp;
989 }
990 // we can't do better than covering all
991 // remaining uncovered vertices. So, if we
992 // accomplish that, then quit here.
993 if mcc_len == num_remaining {
994 break;
995 }
996 }
997
998 if best_covering_txp == u32::MAX {
999 crit!(log, "Could not find a covering transcript");
1000 std::process::exit(1);
1001 }
1002
1003 // get gene_id of best covering transcript
1004 let best_covering_gene = if gene_level_eq_map {
1005 best_covering_txp
1006 } else {
1007 tid_to_gid[best_covering_txp as usize]
1008 };
1009
1010 //unsafe {
1011 global_txps.clear();
1012 // We iterate over all vertices in the mcc, and for
1013 // each one, we keep track of the (monotonically
1014 // non-increasing) set of transcripts that have appeared
1015 // in all vertices.
1016 for (index, vertex) in best_mcc.iter().enumerate() {
1017 // get the underlying graph vertex for this
1018 // vertex in the mcc
1019 let vert = g.from_index((*vertex) as usize);
1020
1021 // the first element of the vertex tuple is the
1022 // equivalence class id
1023 let eqid = vert.0 as usize;
1024
1025 // if this is the first vertex
1026 if index == 0 {
1027 for lt in eqmap.refs_for_eqc(eqid as u32) {
1028 global_txps.insert(*lt);
1029 }
1030 } else {
1031 //crit!(log, "global txps = {:#?}\ncurr refs = {:#?}", global_txps, eqmap.refs_for_eqc(eqid as u32));
1032 let txps_for_vert = eqmap.refs_for_eqc(eqid as u32);
1033 global_txps.retain(|t| txps_for_vert.binary_search(t).is_ok());
1034 //for lt in eqmap.refs_for_eqc(eqid as u32) {
1035 // global_txps.remove(lt);
1036 //}
1037 }
1038 }
1039 // at this point, whatever transcript ids remain in
1040 // global_txps appear in all vertices of the mcc
1041
1042 //} // unsafe
1043
1044 // project each covering transcript to its
1045 // corresponding gene, and dedup the list
1046 let mut global_genes: Vec<u32> = if gene_level_eq_map {
1047 global_txps.iter().cloned().collect()
1048 } else {
1049 global_txps
1050 .iter()
1051 .cloned()
1052 .map(|i| tid_to_gid[i as usize])
1053 .collect()
1054 };
1055 // sort since we will be hashing the ordered vector
1056 global_genes.sort_unstable();
1057 // dedup as well since we don't care about duplicates
1058 global_genes.dedup();
1059
1060 pug_stats.total_mccs += 1;
1061 if global_genes.len() > 1 {
1062 pug_stats.ambiguous_mccs += 1;
1063 }
1064
1065 // assert the best covering gene in the global gene set
1066 assert!(
1067 global_genes.contains(&best_covering_gene),
1068 "best gene {} not in covering set, shouldn't be possible",
1069 best_covering_gene
1070 );
1071
1072 assert!(
1073 !global_genes.is_empty(),
1074 "can't find representative gene(s) for a molecule"
1075 );
1076
1077 // in our hash, increment the count of this equivalence class
1078 // by 1 (and insert it if we've not seen it yet).
1079 let counter = gene_eqclass_hash.entry(global_genes).or_insert(0);
1080 *counter += 1;
1081
1082 // for every vertext that has been covered
1083 // remove it from uncovered_vertices
1084 for rv in best_mcc.iter() {
1085 uncovered_vertices.remove(rv);
1086 }
1087 } //end-while
1088 } else {
1089 // this was a single-vertex subgraph
1090 let tv = comp_verts.first().expect("can't extract first vertex");
1091 let tl = eqmap.refs_for_eqc(g.from_index(*tv as usize).0);
1092
1093 if tl.len() == 1 {
1094 one_vertex_components[0] += 1;
1095 } else {
1096 one_vertex_components[1] += 1;
1097 }
1098
1099 let mut global_genes: Vec<u32>;
1100
1101 if gene_level_eq_map {
1102 global_genes = tl.to_vec();
1103 } else {
1104 global_genes = tl.iter().map(|i| tid_to_gid[*i as usize]).collect();
1105 global_genes.sort_unstable();
1106 global_genes.dedup();
1107 }
1108
1109 // extract gene-level eqclass and increment count by 1
1110 assert!(
1111 !global_genes.is_empty(),
1112 "can't find representative gene(s) for a molecule"
1113 );
1114
1115 pug_stats.total_mccs += 1;
1116 pug_stats.trivial_mccs += 1;
1117 if global_genes.len() > 1 {
1118 pug_stats.ambiguous_mccs += 1;
1119 }
1120 // incrementing the count of the eqclass label by 1
1121 let counter = gene_eqclass_hash.entry(global_genes).or_insert(0);
1122 *counter += 1;
1123 }
1124
1125 //let rand_cover = rand::thread_rng().choose(&tl)
1126 // .expect("can;t get random cover");
1127 //identified_txps.push(*rand_cover as u32);
1128 }
1129
1130 /*(gene_eqclass_hash,*/
1131 pug_stats
1132 //)
1133 /*
1134 let mut salmon_eqclasses = Vec::<SalmonEQClass>::new();
1135 for (key, val) in salmon_eqclass_hash {
1136 salmon_eqclasses.push(SalmonEQClass {
1137 labels: key,
1138 counts: val,
1139 });
1140 }
1141
1142 let mut unique_evidence: Vec<bool> = vec![false; gid_map.len()];
1143 let mut no_ambiguity: Vec<bool> = vec![true; gid_map.len()];
1144 if is_only_cell {
1145 info!("Total Networks: {}", comps.len());
1146 let num_txp_unique_networks: usize = one_vertex_components.iter().sum();
1147 let num_txp_ambiguous_networks: usize = comps.len() - num_txp_unique_networks;
1148 info!(
1149 ">1 vertices Network: {}, {}%",
1150 num_txp_ambiguous_networks,
1151 num_txp_ambiguous_networks as f32 * 100.0 / comps.len() as f32
1152 );
1153 info!(
1154 "1 vertex Networks w/ 1 txp: {}, {}%",
1155 one_vertex_components[0],
1156 one_vertex_components[0] as f32 * 100.0 / comps.len() as f32
1157 );
1158 info!(
1159 "1 vertex Networks w/ >1 txp: {}, {}%",
1160 one_vertex_components[1],
1161 one_vertex_components[1] as f32 * 100.0 / comps.len() as f32
1162 );
1163
1164 //info!("Total Predicted Molecules {}", identified_txps.len());
1165
1166 // iterate and extract gene names
1167 let mut gene_names: Vec<String> = vec!["".to_string(); gid_map.len()];
1168 for (gene_name, gene_idx) in gid_map {
1169 gene_names[*gene_idx as usize] = gene_name.clone();
1170 }
1171
1172 if num_bootstraps > 0 {
1173 //entry point for bootstrapping
1174 let gene_counts: Vec<Vec<f32>> = do_bootstrapping(salmon_eqclasses,
1175 &mut unique_evidence,
1176 &mut no_ambiguity,
1177 &num_bootstraps,
1178 gid_map.len(),
1179 only_unique);
1180
1181 write_bootstraps(gene_names, gene_counts, unique_evidence,
1182 no_ambiguity, num_bootstraps);
1183 return None;
1184 }
1185 else{
1186 //entry point for EM
1187 //println!("{:?}", subsample_gene_idx);
1188 //println!("{:?}", &salmon_eqclasses);
1189 let gene_counts: Vec<f32> = optimize(salmon_eqclasses, &mut unique_evidence,
1190 &mut no_ambiguity, gid_map.len(), only_unique);
1191
1192 write_quants(gene_names, gene_counts, unique_evidence, no_ambiguity);
1193 return None;
1194 } // end-else
1195 }
1196 else {
1197 Some(optimize(salmon_eqclasses,
1198 &mut unique_evidence,
1199 &mut no_ambiguity,
1200 gid_map.len(),
1201 only_unique
1202 ))
1203 }
1204 */
1205 //identified_txps
1206}