alevin_fry/
pugutils.rs

1/*
2 * Copyright (c) 2020-2024 COMBINE-lab.
3 *
4 * This file is part of alevin-fry
5 * (see https://www.github.com/COMBINE-lab/alevin-fry).
6 *
7 * License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
8 */
9
10#[allow(unused_imports)]
11use ahash::{AHasher, RandomState};
12use arrayvec::ArrayVec;
13use smallvec::SmallVec;
14use std::cmp::Ordering;
15use std::collections::{HashMap, HashSet, VecDeque};
16use std::io;
17use std::io::Write;
18
19use petgraph::prelude::*;
20use petgraph::unionfind::*;
21use petgraph::visit::NodeIndexable;
22
23use libradicl::chunk;
24use libradicl::record::AlevinFryReadRecord;
25
26use slog::{crit, info, warn};
27
28use crate::eq_class::{EqMap, EqMapType};
29use crate::quant::SplicedAmbiguityModel;
30use crate::utils as afutils;
31
32type CcMap = HashMap<u32, Vec<u32>, ahash::RandomState>;
33
34#[derive(Debug)]
35pub enum PugEdgeType {
36    NoEdge,
37    BiDirected,
38    XToY,
39    YToX,
40}
41
42#[derive(Debug)]
43pub struct PugResolutionStatistics {
44    pub used_alternative_strategy: bool,
45    pub total_mccs: u64,
46    pub ambiguous_mccs: u64,
47    pub trivial_mccs: u64,
48}
49
50/// Extracts the parsimonious UMI graphs (PUGs) from the
51/// equivalence class map for a given cell.
52/// The returned graph is a directed graph (potentially with
53/// bidirected edges) where each node consists of an (equivalence
54/// class, UMI ID) pair.  Note, crucially, that the UMI ID is simply
55/// the rank of the UMI in the list of all distinct UMIs for this
56/// equivalence class.  There is a directed edge between any pair of
57/// vertices whose set of transcripts overlap and whose UMIs are within
58/// a Hamming distance of 1 of each other.  If one node has more than
59/// twice the frequency of the other, the edge is directed from the
60/// more frequent to the less freuqent node.  Otherwise, edges are
61/// added in both directions.
62pub fn extract_graph(
63    eqmap: &EqMap,
64    pug_exact_umi: bool, // true if only identical UMIs induce an edge
65    log: &slog::Logger,
66) -> petgraph::graphmap::GraphMap<(u32, u32), (), petgraph::Directed> {
67    let verbose = false;
68    let mut one_edit = 0u64;
69    let mut zero_edit = 0u64;
70
71    // given 2 pairs (UMI, count), determine if an edge exists
72    // between them, and if so, what type.
73    let mut has_edge = |x: &(u64, u32), y: &(u64, u32)| -> PugEdgeType {
74        let hdist = if pug_exact_umi {
75            if x.0 == y.0 {
76                0
77            } else {
78                usize::MAX
79            }
80        } else {
81            afutils::count_diff_2_bit_packed(x.0, y.0)
82        };
83
84        if hdist == 0 {
85            zero_edit += 1;
86            return PugEdgeType::BiDirected;
87        }
88
89        if hdist < 2 {
90            one_edit += 1;
91            return if x.1 > (2 * y.1 - 1) {
92                PugEdgeType::XToY
93            } else if y.1 > (2 * x.1 - 1) {
94                PugEdgeType::YToX
95            } else {
96                PugEdgeType::BiDirected
97            };
98        }
99        PugEdgeType::NoEdge
100    };
101
102    let mut _bidirected = 0u64;
103    let mut _unidirected = 0u64;
104
105    let mut graph = DiGraphMap::<(u32, u32), ()>::new();
106    let mut hset = vec![0u8; eqmap.num_eq_classes()];
107    let mut idxvec: SmallVec<[u32; 128]> = SmallVec::new();
108
109    // insert all of the nodes up front to avoid redundant
110    // checks later.
111    for eqid in 0..eqmap.num_eq_classes() {
112        // get the info Vec<(UMI, frequency)>
113        let eq = &eqmap.eqc_info[eqid];
114        let u1 = &eq.umis;
115        for (xi, _x) in u1.iter().enumerate() {
116            graph.add_node((eqid as u32, xi as u32));
117        }
118    }
119
120    // for every equivalence class in this cell
121    for eqid in 0..eqmap.num_eq_classes() {
122        if verbose && eqid % 1000 == 0 {
123            print!("\rprocessed {:?} eq classes", eqid);
124            io::stdout().flush().expect("Could not flush stdout");
125        }
126
127        // get the info Vec<(UMI, frequency)>
128        let eq = &eqmap.eqc_info[eqid];
129
130        // for each (umi, count) pair and its index
131        let u1 = &eq.umis;
132        for (xi, x) in u1.iter().enumerate() {
133            // add a node
134            // graph.add_node((eqid as u32, xi as u32));
135
136            // for each (umi, freq) pair and node after this one
137            for (xi2, x2) in u1.iter().enumerate().skip(xi + 1) {
138                //for xi2 in (xi + 1)..u1.len() {
139                // x2 is the other (umi, freq) pair
140                //let x2 = &u1[xi2];
141
142                // add a node for it
143                // graph.add_node((eqid as u32, xi2 as u32));
144
145                // determine if an edge exists between x and x2, and if so, what kind
146                let et = has_edge(x, x2);
147                // for each type of edge, add the appropriate edge in the graph
148                match et {
149                    PugEdgeType::BiDirected => {
150                        graph.add_edge((eqid as u32, xi as u32), (eqid as u32, xi2 as u32), ());
151                        graph.add_edge((eqid as u32, xi2 as u32), (eqid as u32, xi as u32), ());
152                        _bidirected += 1;
153                        //if multi_gene_vec[eqid] == true {
154                        //    bidirected_in_multigene += 1;
155                        //}
156                    }
157                    PugEdgeType::XToY => {
158                        graph.add_edge((eqid as u32, xi as u32), (eqid as u32, xi2 as u32), ());
159                        _unidirected += 1;
160                        //if multi_gene_vec[eqid] == true {
161                        //    unidirected_in_multigene += 1;
162                        //}
163                    }
164                    PugEdgeType::YToX => {
165                        graph.add_edge((eqid as u32, xi2 as u32), (eqid as u32, xi as u32), ());
166                        _unidirected += 1;
167                        //if multi_gene_vec[eqid] == true {
168                        //    unidirected_in_multigene += 1;
169                        //}
170                    }
171                    PugEdgeType::NoEdge => {}
172                }
173            }
174        }
175
176        //hset.clear();
177        //hset.resize(eqmap.num_eq_classes(), 0u8);
178        for i in &idxvec {
179            hset[*i as usize] = 0u8;
180        }
181        let stf = idxvec.len() > 128;
182        idxvec.clear();
183        if stf {
184            idxvec.shrink_to_fit();
185        }
186
187        // for every reference id in this eq class
188        for r in eqmap.refs_for_eqc(eqid as u32) {
189            // find the equivalence classes sharing this reference
190            for eq2id in eqmap.eq_classes_containing(*r).iter() {
191                // if eq2id <= eqid, then we already observed the relevant edges
192                // when we process eq2id
193                if (*eq2id as usize) <= eqid {
194                    continue;
195                }
196                // otherwise, if we have already processed this other equivalence
197                // class because it shares _another_ reference (apart from r) with
198                // the current equivalence class, then skip it.
199                if hset[*eq2id as usize] > 0 {
200                    continue;
201                }
202
203                // recall that we processed this eq class as a neighbor of eqid
204                hset[*eq2id as usize] = 1;
205                idxvec.push(*eq2id);
206                let eq2 = &eqmap.eqc_info[*eq2id as usize];
207
208                // compare all the umis between eqid and eq2id
209                let u2 = &eq2.umis;
210                for (xi, x) in u1.iter().enumerate() {
211                    // Node for equiv : eqid and umi : xi
212                    // graph.add_node((eqid as u32, xi as u32));
213
214                    for (yi, y) in u2.iter().enumerate() {
215                        // Node for equiv : eq2id and umi : yi
216                        // graph.add_node((*eq2id as u32, yi as u32));
217
218                        let et = has_edge(x, y);
219                        match et {
220                            PugEdgeType::BiDirected => {
221                                graph.add_edge((eqid as u32, xi as u32), (*eq2id, yi as u32), ());
222                                graph.add_edge((*eq2id, yi as u32), (eqid as u32, xi as u32), ());
223                                _bidirected += 1;
224                                //if multi_gene_vec[eqid] == true
225                                //    || multi_gene_vec[*eq2id as usize] == true
226                                //{
227                                //    bidirected_in_multigene += 1;
228                                //}
229                            }
230                            PugEdgeType::XToY => {
231                                graph.add_edge((eqid as u32, xi as u32), (*eq2id, yi as u32), ());
232                                _unidirected += 1;
233                                //if multi_gene_vec[eqid] == true
234                                //    || multi_gene_vec[*eq2id as usize] == true
235                                //{
236                                //    unidirected_in_multigene += 1;
237                                //}
238                            }
239                            PugEdgeType::YToX => {
240                                graph.add_edge((*eq2id, yi as u32), (eqid as u32, xi as u32), ());
241                                _unidirected += 1;
242                                //if multi_gene_vec[eqid] == true
243                                //    || multi_gene_vec[*eq2id as usize] == true
244                                //{
245                                //    unidirected_in_multigene += 1;
246                                //}
247                            }
248                            PugEdgeType::NoEdge => {}
249                        }
250                    }
251                }
252            }
253        }
254    }
255
256    if verbose {
257        info!(
258            log,
259            "\n\nsize of graph ({:?}, {:?})\n\n",
260            graph.node_count(),
261            graph.edge_count()
262        );
263        let total_edits = (one_edit + zero_edit) as f64;
264        info!(log, "\n\n\n{}\n\n\n", one_edit as f64 / total_edits);
265    }
266
267    graph
268}
269
270/// Extract the weakly connected components from the directed graph
271/// G.  Interestingly, `petgraph` has a builtin algorithm for returning
272/// the strongly-connected components of a digraph, and they have an
273/// algorithm for returning the _number_ of connected components of an
274/// undirected graph, but no algorithm for returning the actual
275/// connected components.  So, we build our own using their union
276/// find data structure.  This returns a HashMap, mapping each
277/// connected component id (a u32) to the corresponding list of vertex
278/// ids (also u32s) contained in the connected component.
279pub fn weakly_connected_components<G>(g: G) -> CcMap
280where
281    G: petgraph::visit::NodeCompactIndexable + petgraph::visit::IntoEdgeReferences,
282{
283    let mut vertex_sets = UnionFind::new(g.node_bound());
284    for edge in g.edge_references() {
285        let (a, b) = (edge.source(), edge.target());
286
287        // union the two vertices of the edge
288        vertex_sets.union(g.to_index(a), g.to_index(b));
289    }
290    let labels = vertex_sets.into_labeling();
291    fn get_map() -> CcMap {
292        let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
293        HashMap::with_hasher(s)
294    }
295
296    let mut components = get_map();
297    for (i, v) in labels.iter().enumerate() {
298        let ve = components.entry(*v as u32).or_default();
299        ve.push(i as u32);
300    }
301    components
302}
303
304/// Find the largest monochromatic spanning arboresence
305/// in the graph `g` starting at vertex `v`.  The arboresence
306/// is monochromatic if every vertex can be "covered" by a single
307/// transcript (i.e. there exists a transcript that appears in the
308/// equivalence class labels of all vertices in the arboresence).
309fn collapse_vertices(
310    v: u32,
311    uncovered_vertices: &HashSet<u32, ahash::RandomState>, // the set of vertices already covered
312    g: &petgraph::graphmap::GraphMap<(u32, u32), (), petgraph::Directed>,
313    eqmap: &EqMap,
314    hasher_state: &ahash::RandomState,
315) -> (Vec<u32>, u32) {
316    // get a new set to hold vertices
317    type VertexSet = HashSet<u32, ahash::RandomState>;
318    let get_set =
319        |cap: u32| VertexSet::with_capacity_and_hasher(cap as usize, hasher_state.clone());
320
321    // will hold the nodes in the largest arboresence found
322    let mut largest_mcc: Vec<u32> = Vec::new();
323    let mut chosen_txp = 0u32;
324    let vert = g.from_index(v as usize);
325
326    //unsafe {
327
328    let nvert = g.node_count();
329
330    // for every transcript in the equivalence class
331    // label of the vertex
332    for txp in eqmap.refs_for_eqc(vert.0).iter() {
333        // start a bfs from this vertex
334        let mut bfs_list = VecDeque::new();
335        bfs_list.push_back(v);
336
337        // the set to remember the nodes we've already
338        // visited
339        let mut visited_set = get_set(nvert as u32);
340        visited_set.insert(v);
341
342        // will hold the current arboresence we
343        // are constructing
344        let mut current_mcc = Vec::new();
345
346        // get the next vertex in the BFS
347        while let Some(cv) = bfs_list.pop_front() {
348            // add it to the arboresence
349            current_mcc.push(cv);
350
351            // for all of the neighboring vertices that we can
352            // reach (those with outgoing, or bidirected edges)
353            for nv in g.neighbors_directed(g.from_index(cv as usize), Outgoing) {
354                let n = g.to_index(nv) as u32;
355
356                // check if we should add this vertex or not:
357                // uncovered_vertices contains the the current set of
358                // *uncovered* vertices in this component (i.e. those
359                // that we still need to explain by some molecule).
360                // so, if n is *not* in uncovered_vertices, then it is not in the
361                // uncovered set, and so it has already been
362                // explained / covered.
363                //
364                // if n hasn't been covered yet, then
365                // check if we've seen n in this traversal
366                // yet. The `insert()` method returns true
367                // if the set didn't have the element, false
368                // otherwise.
369                if !uncovered_vertices.contains(&n) || !visited_set.insert(n) {
370                    continue;
371                }
372
373                // get the set of transcripts present in the
374                // label of the current node.
375                let n_labels = eqmap.refs_for_eqc(nv.0);
376                if let Ok(_n) = n_labels.binary_search(txp) {
377                    bfs_list.push_back(n);
378                }
379            }
380        }
381
382        // if this arboresence is the largest we've yet
383        // seen, then record it
384        if largest_mcc.len() < current_mcc.len() {
385            largest_mcc = current_mcc;
386            chosen_txp = *txp;
387        }
388    }
389    //}// unsafe
390
391    (largest_mcc, chosen_txp)
392}
393
394#[inline]
395fn resolve_num_molecules_crlike_from_vec_prefer_ambig(
396    umi_gene_count_vec: &mut [(u64, u32, u32)],
397    gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
398) {
399    // sort the triplets
400    // first on umi
401    // then on gene_id
402    // then on count
403    umi_gene_count_vec.sort_unstable();
404
405    // hold the current umi and gene we are examining
406    let mut curr_umi = umi_gene_count_vec.first().expect("cell with no UMIs").0;
407    let first_gn = umi_gene_count_vec.first().expect("cell with no UMIs").1;
408    // The capacity of curr_gn is 2 as it will be used to hold the
409    // the spliced id of a gene, the unspliced id of a gene, or both
410    let mut curr_gn = ArrayVec::<u32, 2>::new();
411    curr_gn.push(first_gn);
412
413    // hold the gene id having the max count for this umi
414    // and the maximum count value itself
415    let mut max_count = 0u32;
416    // to aggregate the count should a (umi, gene) pair appear
417    // more than once
418    let mut count_aggr = 0u32;
419    // the vector will hold the equivalent set of best genes
420    let mut best_genes = Vec::<u32>::with_capacity(16);
421
422    // look over all sorted triplets
423    for (cidx, &(umi, gn, ct)) in umi_gene_count_vec.iter().enumerate() {
424        // if this umi is different than
425        // the one we are processing
426        // then decide what action to take
427        // on the previous umi
428        if umi != curr_umi {
429            // update the count of the equivalence class of genes
430            // that gets this UMI
431
432            *gene_eqclass_hash.entry(best_genes.clone()).or_insert(0) += 1;
433
434            // the next umi and gene
435            curr_umi = umi;
436            curr_gn.clear();
437            curr_gn.push(gn);
438
439            // current gene is current best
440            best_genes.clear();
441            best_genes.push(gn);
442
443            // count aggr = max count = ct
444            count_aggr = ct;
445            max_count = ct;
446        } else {
447            // the umi was the same
448
449            let prev_gid = *curr_gn.last().expect("not empty");
450
451            // if the gene is the same (modulo splicing), add the counts
452            if afutils::same_gene(gn, prev_gid, true) {
453                // if the gene is the same modulo splicing, but
454                // curr_gn != gn, then this is gene will be set as ambiguous,
455                // this is the transition from counting occurrences
456                // of this UMI from the spliced to unspliced version
457                // of the gene.
458                if prev_gid != gn {
459                    // mark the current gene as
460                    // splicing ambiguous by pushing
461                    // the unspliced id onto curr_gn
462                    curr_gn.push(gn);
463                }
464                count_aggr += ct;
465            } else {
466                // if the gene is different, then restart the count_aggr
467                // and set the current gene id
468                count_aggr = ct;
469                curr_gn.clear();
470                curr_gn.push(gn);
471            }
472
473            // we have the following cases, consider we are
474            // processing the records for gene g_i.  If
475            // * curr_gn = [g_i^s], then we have so far only observed spliced reads for g_i
476            // * curr_gn = [g_i^u], then we have only observed unspliced reads for g_i
477            //   (and there are no spliced reads)
478            // * curr_gn = [g_i^s, g_i^u], then we have observed both spliced and unspliced
479            //   reads for g_i, and this gene will be considered splicing ambiguous.
480
481            // the current best_genes is either empty or contains some
482            // set of genes g_i-k, ..., g_i-1, and now,
483            // g_i matches their count.  If g_i has only observed spliced reads,
484            // then we will add g_i^s to the list.  If g_i has observed
485            // only unspliced reads then we will add g_i^u, otherwise
486            // we will add both g_i^s and g_i^u.
487
488            // the current best_genes is either emtpy or contains some
489            // set of genes g_i-k, ..., g_i-1, and now,
490            // g_i *exceeds* their count.  If g_i has only observed spliced reads,
491            // then we will *clear* best_genes, and replace it with g_i^s.
492            // If g_i has only observed unspliced reads, then we will *clear*
493            // best_genes and replace it with g_i^u.  Othewise we will *clear*
494            // best_genes and replace it with [g_i^s, g_i^u];
495
496            // if the count aggregator exceeded the max
497            // then it is the new max, and this gene is
498            // the new max gene.  Having a distinct max
499            // also makes this UMI uniquely resolvable
500            match count_aggr.cmp(&max_count) {
501                Ordering::Greater => {
502                    max_count = count_aggr;
503                    best_genes.clear();
504                    best_genes.extend(curr_gn.iter());
505                }
506                Ordering::Equal => {
507                    // if we have a tie for the max count
508                    // then the current UMI isn't uniquely-unresolvable
509                    // it will stay this way unless we see a bigger
510                    // count for this UMI.  We add the current
511                    // "tied" gene to the equivalence class.
512                    best_genes.extend(curr_gn.iter());
513                }
514                Ordering::Less => {
515                    // we do nothing
516                }
517            }
518        }
519
520        // if this was the last UMI in the list
521        if cidx == umi_gene_count_vec.len() - 1 {
522            *gene_eqclass_hash.entry(best_genes.clone()).or_insert(0) += 1;
523        }
524    }
525}
526
527#[inline]
528fn resolve_num_molecules_crlike_from_vec(
529    umi_gene_count_vec: &mut [(u64, u32, u32)],
530    gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
531) {
532    // sort the triplets
533    // first on umi
534    // then on gene_id
535    // then on count
536    umi_gene_count_vec.sort_unstable();
537
538    // hold the current umi and gene we are examining
539    let mut curr_umi = umi_gene_count_vec.first().expect("cell with no UMIs").0;
540    let mut curr_gn = umi_gene_count_vec.first().expect("cell with no UMIs").1;
541    // hold the gene id having the max count for this umi
542    // and the maximum count value itself
543    // let mut max_count_gene = 0u32;
544    let mut max_count = 0u32;
545    // to aggregate the count should a (umi, gene) pair appear
546    // more than once
547    let mut count_aggr = 0u32;
548    // the vector will hold the equivalent set of best genes
549    let mut best_genes = Vec::<u32>::with_capacity(16);
550
551    // look over all sorted triplets
552    for (cidx, &(umi, gn, ct)) in umi_gene_count_vec.iter().enumerate() {
553        // if this umi is different than
554        // the one we are processing
555        // then decide what action to take
556        // on the previous umi
557        if umi != curr_umi {
558            // update the count of the equivalence class of genes
559            // that gets this UMI
560            *gene_eqclass_hash.entry(best_genes.clone()).or_insert(0) += 1;
561
562            // the next umi and gene
563            curr_umi = umi;
564            curr_gn = gn;
565
566            // current gene is current best
567            // max_count_gene = gn;
568            best_genes.clear();
569            best_genes.push(gn);
570
571            // count aggr = max count = ct
572            count_aggr = ct;
573            max_count = ct;
574        } else {
575            // the umi was the same
576
577            // if the gene is the same, add the counts
578            if gn == curr_gn {
579                count_aggr += ct;
580            } else {
581                // if the gene is different, then restart the count_aggr
582                // and set the current gene id
583                count_aggr = ct;
584                curr_gn = gn;
585            }
586            // if the count aggregator exceeded the max
587            // then it is the new max, and this gene is
588            // the new max gene.  Having a distinct max
589            // also makes this UMI uniquely resolvable
590            match count_aggr.cmp(&max_count) {
591                Ordering::Greater => {
592                    max_count = count_aggr;
593                    // we want to avoid the case that we are just
594                    // updating the count of the best gene above and
595                    // here we clear out the vector and populate it
596                    // with the same element again and again.  So
597                    // if the current best_genes vector holds just
598                    // gn, we do nothing.  Otherwise we clear it and
599                    // add gn.
600                    match &best_genes[..] {
601                        [x] if *x == gn => { /* do nothing here */ }
602                        _ => {
603                            best_genes.clear();
604                            best_genes.push(gn);
605                        }
606                    }
607                }
608                Ordering::Equal => {
609                    // if we have a tie for the max count
610                    // then the current UMI isn't uniquely-unresolvable
611                    // it will stay this way unless we see a bigger
612                    // count for this UMI.  We add the current
613                    // "tied" gene to the equivalence class.
614                    best_genes.push(gn);
615                }
616                Ordering::Less => {
617                    // we do nothing
618                }
619            }
620        }
621
622        // if this was the last UMI in the list
623        if cidx == umi_gene_count_vec.len() - 1 {
624            *gene_eqclass_hash.entry(best_genes.clone()).or_insert(0) += 1;
625        }
626    }
627}
628
629pub fn get_num_molecules_cell_ranger_like_small(
630    cell_chunk: &mut chunk::Chunk<AlevinFryReadRecord>,
631    tid_to_gid: &[u32],
632    _num_genes: usize,
633    gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
634    sa_model: SplicedAmbiguityModel,
635    _log: &slog::Logger,
636) {
637    let mut umi_gene_count_vec: Vec<(u64, u32, u32)> = Vec::with_capacity(cell_chunk.nrec as usize);
638
639    // for each record
640    for rec in &cell_chunk.reads {
641        // get the umi
642        let umi = rec.umi;
643
644        // project the transcript ids to gene ids
645        let mut gset: Vec<u32> = rec
646            .refs
647            .iter()
648            .map(|tid| tid_to_gid[*tid as usize])
649            .collect();
650        // and make the gene ids unique
651        gset.sort_unstable();
652        gset.dedup();
653        for g in &gset {
654            umi_gene_count_vec.push((umi, *g, 1));
655        }
656    }
657    match sa_model {
658        SplicedAmbiguityModel::WinnerTakeAll => {
659            resolve_num_molecules_crlike_from_vec(&mut umi_gene_count_vec, gene_eqclass_hash);
660        }
661        SplicedAmbiguityModel::PreferAmbiguity => {
662            resolve_num_molecules_crlike_from_vec_prefer_ambig(
663                &mut umi_gene_count_vec,
664                gene_eqclass_hash,
665            );
666        }
667    }
668}
669
670pub fn get_num_molecules_cell_ranger_like(
671    eq_map: &EqMap,
672    tid_to_gid: &[u32],
673    _num_genes: usize,
674    gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
675    sa_model: SplicedAmbiguityModel,
676    _log: &slog::Logger,
677) {
678    // TODO: better capacity
679    let mut umi_gene_count_vec: Vec<(u64, u32, u32)> = vec![];
680
681    // for each equivalence class
682    for eqinfo in &eq_map.eqc_info {
683        // get the (umi, count) pairs
684        let umis = &eqinfo.umis;
685        let eqid = &eqinfo.eq_num;
686
687        // project the transcript ids to gene ids
688        let mut gset: Vec<u32> = eq_map
689            .refs_for_eqc(*eqid)
690            .iter()
691            .map(|tid| tid_to_gid[*tid as usize])
692            .collect();
693        // and make the gene ids unique,
694        // note, if we have both spliced and
695        // unspliced gene ids, then they will
696        // necessarily be adjacent here, since
697        // they are always asigned adjacent ids
698        // with spliced being even and unspliced odd.
699        gset.sort_unstable();
700        gset.dedup();
701
702        // add every (umi, count), gene pair as a triplet
703        // of (umi, gene_id, count) to the output vector
704        for umi_ct in umis {
705            for g in &gset {
706                umi_gene_count_vec.push((umi_ct.0, *g, umi_ct.1));
707            }
708        }
709    }
710    match sa_model {
711        SplicedAmbiguityModel::WinnerTakeAll => {
712            resolve_num_molecules_crlike_from_vec(&mut umi_gene_count_vec, gene_eqclass_hash);
713        }
714        SplicedAmbiguityModel::PreferAmbiguity => {
715            resolve_num_molecules_crlike_from_vec_prefer_ambig(
716                &mut umi_gene_count_vec,
717                gene_eqclass_hash,
718            );
719        }
720    }
721}
722
723pub fn get_num_molecules_trivial_discard_all_ambig(
724    eq_map: &EqMap,
725    tid_to_gid: &[u32],
726    num_genes: usize,
727    _log: &slog::Logger,
728) -> (Vec<f32>, f64) {
729    let mut counts = vec![0.0f32; num_genes];
730    let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
731    let mut gene_map: std::collections::HashMap<u32, Vec<u64>, ahash::RandomState> =
732        HashMap::with_hasher(s);
733
734    let mut total_umis = 0u64;
735    let mut multi_gene_umis = 0u64;
736
737    for eqinfo in &eq_map.eqc_info {
738        let umis = &eqinfo.umis;
739        let eqid = &eqinfo.eq_num;
740        let tset = eq_map.refs_for_eqc(*eqid);
741        let mut prev_gene_id = u32::MAX;
742        let mut multi_gene = false;
743        // if this is a single-gene equivalence class
744        // then go ahead and assign the read
745        for t in tset {
746            let gid = tid_to_gid[*t as usize];
747            if gid != prev_gene_id && prev_gene_id < u32::MAX {
748                multi_gene = true;
749                break;
750            }
751            prev_gene_id = gid;
752        }
753
754        total_umis += umis.len() as u64;
755        if multi_gene {
756            multi_gene_umis += umis.len() as u64;
757        }
758
759        // if the read is single-gene
760        // then add this equivalence class' list
761        // of UMIs in the gene map
762        if !multi_gene {
763            gene_map
764                .entry(prev_gene_id)
765                .or_default()
766                .extend(umis.iter().map(|x| x.0));
767        }
768    }
769
770    // go over the map and merge umis from different
771    // equivalence classes that still map to the same
772    // gene.
773    for (k, v) in gene_map.iter_mut() {
774        v.sort_unstable();
775        v.dedup();
776        // the count is the number of distinct UMIs.
777        counts[*k as usize] += v.len() as f32;
778    }
779
780    // return the counts
781    (counts, multi_gene_umis as f64 / total_umis as f64)
782}
783
784/// given the connected component (subgraph) of `g` defined by the
785/// vertices in `vertex_ids`, apply the cell-ranger-like algorithm
786/// within this subgraph.
787fn get_num_molecules_large_component(
788    g: &petgraph::graphmap::GraphMap<(u32, u32), (), petgraph::Directed>,
789    eq_map: &EqMap,
790    vertex_ids: &[u32],
791    tid_to_gid: &[u32],
792    hasher_state: &ahash::RandomState,
793    gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
794    _log: &slog::Logger,
795) {
796    let gene_level_eq_map = match eq_map.map_type {
797        EqMapType::GeneLevel => true,
798        EqMapType::TranscriptLevel => false,
799    };
800
801    // TODO: better capacity
802    let mut umi_gene_count_vec: Vec<(u64, u32, u32)> = vec![];
803
804    // build a temporary hashmap from each
805    // equivalence class id in the current subgraph
806    // to the set of (UMI, frequency) pairs contained
807    // in the subgraph
808    //let ts = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
809    let mut tmp_map =
810        HashMap::<u32, Vec<(u64, u32)>, ahash::RandomState>::with_hasher(hasher_state.clone());
811
812    // for each vertex id in the subgraph
813    for vertex_id in vertex_ids {
814        // get the corresponding vertex which is
815        // an (eq_id, UMI index) pair
816        let vert = g.from_index(*vertex_id as usize);
817        // add the corresponding (UMI, frequency) pair to the map
818        // for this eq_id
819        let umis = tmp_map.entry(vert.0).or_default();
820        umis.push(eq_map.eqc_info[vert.0 as usize].umis[vert.1 as usize]);
821    }
822
823    for (k, v) in tmp_map.iter() {
824        // get the (umi, count) pairs
825        let umis = v; //&eqinfo.umis;
826        let eqid = k; //&eqinfo.eq_num;
827                      // project the transcript ids to gene ids
828        let mut gset: Vec<u32>;
829
830        if gene_level_eq_map {
831            gset = eq_map.refs_for_eqc(*eqid).to_vec();
832        } else {
833            gset = eq_map
834                .refs_for_eqc(*eqid)
835                .iter()
836                .map(|tid| tid_to_gid[*tid as usize])
837                .collect();
838            // and make the gene ids unique
839            gset.sort_unstable();
840            gset.dedup();
841        }
842
843        // add every (umi, count), gene pair as a triplet
844        // of (umi, gene_id, count) to the output vector
845        for umi_ct in umis {
846            for g in &gset {
847                umi_gene_count_vec.push((umi_ct.0, *g, umi_ct.1));
848            }
849        }
850    }
851
852    resolve_num_molecules_crlike_from_vec(&mut umi_gene_count_vec, gene_eqclass_hash);
853}
854
855/// Given the digraph `g` representing the PUGs within the current
856/// cell, the EqMap `eqmap` to decode all equivalence classes
857/// and the transcript-to-gene map `tid_to_gid`, apply the parsimonious
858/// umi resolution algorithm.  Pass any relevant logging messages along to
859/// `log`.
860pub fn get_num_molecules(
861    g: &petgraph::graphmap::GraphMap<(u32, u32), (), petgraph::Directed>,
862    eqmap: &EqMap,
863    tid_to_gid: &[u32],
864    gene_eqclass_hash: &mut HashMap<Vec<u32>, u32, ahash::RandomState>,
865    hasher_state: &ahash::RandomState,
866    large_graph_thresh: usize,
867    log: &slog::Logger,
868) -> PugResolutionStatistics
869//,)
870{
871    type U32Set = HashSet<u32, ahash::RandomState>;
872    let get_set = |cap: u32| {
873        //let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
874        U32Set::with_capacity_and_hasher(cap as usize, hasher_state.clone())
875    };
876
877    let gene_level_eq_map = match eqmap.map_type {
878        EqMapType::GeneLevel => true,
879        EqMapType::TranscriptLevel => false,
880    };
881
882    let comps = weakly_connected_components(g);
883    // a vector of length 2 that records at index 0
884    // the number of single-node subgraphs that are
885    // transcript-unique and at index 1 the number of
886    // single-node subgraphs that have more than one
887    // associated transcript.
888    let mut one_vertex_components: Vec<usize> = vec![0, 0];
889
890    // Make gene-level eqclasses.
891    // This is a map of gene ids to the count of
892    // _de-duplicated_ reads observed for that set of genes.
893    // For every gene set (label) of length 1, these are gene
894    // unique reads.  Standard scRNA-seq counting results
895    // can be obtained by simply discarding all equivalence
896    // classes of size greater than 1, and probabilistic results
897    // will attempt to resolve gene multi-mapping reads by
898    // running and EM algorithm.
899    //let s = fasthash::RandomState::<Hash64>::new();
900    //let mut gene_eqclass_hash: HashMap<Vec<u32>, u32, fasthash::RandomState<Hash64>> =
901    //    HashMap::with_hasher(s);
902
903    // Get the genes that could potentially explain all
904    // of the vertices in this mcc.
905    // To do this, we first extract the set of _transcripts_
906    // that label all vertices of the mcc, and then we project
907    // the transcripts to their corresponding gene ids.
908    //let mut global_txps : Vec<u32>;
909    let mut global_txps = get_set(16);
910    let mut pug_stats = PugResolutionStatistics {
911        used_alternative_strategy: false,
912        total_mccs: 0u64,
913        ambiguous_mccs: 0u64,
914        trivial_mccs: 0u64,
915    };
916
917    for (_comp_label, comp_verts) in comps.iter() {
918        if comp_verts.len() > 1 {
919            // the current parsimony resolution algorithm
920            // can become slow for connected components that
921            // are very large.  For components with > large_graph_thresh
922            // vertices (this should be _very_ rare) we will instead
923            // resolve the UMIs in the component using a simpler algorithm.
924            if comp_verts.len() > large_graph_thresh {
925                get_num_molecules_large_component(
926                    g,
927                    eqmap,
928                    comp_verts,
929                    tid_to_gid,
930                    hasher_state,
931                    gene_eqclass_hash,
932                    log,
933                );
934                warn!(
935                    log,
936                    "found connected component with {} vertices; resolved with cr-like resolution.",
937                    comp_verts.len(),
938                );
939                pug_stats.used_alternative_strategy = true;
940                continue;
941            }
942
943            // uncovered_vertices will hold the set of vertices that are
944            // *not yet* covered.
945            //
946            // Non-deterministic variant :
947            // NOTE: The line below places the vertices into a HashSet that uses
948            // a hasher with a RandomState which, by default, Rust will randomize between
949            // runs.  That means that the output of the entire algorithm will, in general,
950            // not be deterministic.  By using a RandomState with fixed seeds, this can
951            // be made deterministic (see below), but it is unclear if this will increase
952            // bias of resolving components in favor of certain transcripts (and therefore genes)
953            // that tend to appear first in hash iteration order.
954            // let mut uncovered_vertices = comp_verts.iter().cloned().collect::<HashSet<u32, ahash::RandomState>>();
955
956            // Deterministic variant : replacing the above line with these two lines will
957            // cause the parsimony resolution to be deterministic, but potentially at the
958            // cost of increasing bias.
959            let mut uncovered_vertices = get_set(comp_verts.len() as u32);
960            for v in comp_verts.iter().cloned() {
961                uncovered_vertices.insert(v);
962            }
963
964            // we will remove covered vertices from uncovered_vertices until they are
965            // all gone (until all vertices have been covered)
966            while !uncovered_vertices.is_empty() {
967                let num_remaining = uncovered_vertices.len();
968                // will hold vertices in the best mcc
969                let mut best_mcc: Vec<u32> = Vec::new();
970                // the transcript that is responsible for the
971                // best mcc covering
972                let mut best_covering_txp = u32::MAX;
973                // for each vertex in the vertex set
974                for v in uncovered_vertices.iter() {
975                    // find the largest mcc starting from this vertex
976                    // and the transcript that covers it
977                    // NOTE: what if there are multiple different mccs that
978                    // are equally good? (@k3yavi — I don't think this case
979                    // is even handled in the C++ code either).
980                    let (new_mcc, covering_txp) =
981                        collapse_vertices(*v, &uncovered_vertices, g, eqmap, hasher_state);
982
983                    let mcc_len = new_mcc.len();
984                    // if the new mcc is better than the current best, then
985                    // it becomes the new best
986                    if best_mcc.len() < mcc_len {
987                        best_mcc = new_mcc;
988                        best_covering_txp = covering_txp;
989                    }
990                    // we can't do better than covering all
991                    // remaining uncovered vertices.  So, if we
992                    // accomplish that, then quit here.
993                    if mcc_len == num_remaining {
994                        break;
995                    }
996                }
997
998                if best_covering_txp == u32::MAX {
999                    crit!(log, "Could not find a covering transcript");
1000                    std::process::exit(1);
1001                }
1002
1003                // get gene_id of best covering transcript
1004                let best_covering_gene = if gene_level_eq_map {
1005                    best_covering_txp
1006                } else {
1007                    tid_to_gid[best_covering_txp as usize]
1008                };
1009
1010                //unsafe {
1011                global_txps.clear();
1012                // We iterate over all vertices in the mcc, and for
1013                // each one, we keep track of the (monotonically
1014                // non-increasing) set of transcripts that have appeared
1015                // in all vertices.
1016                for (index, vertex) in best_mcc.iter().enumerate() {
1017                    // get the underlying graph vertex for this
1018                    // vertex in the mcc
1019                    let vert = g.from_index((*vertex) as usize);
1020
1021                    // the first element of the vertex tuple is the
1022                    // equivalence class id
1023                    let eqid = vert.0 as usize;
1024
1025                    // if this is the first vertex
1026                    if index == 0 {
1027                        for lt in eqmap.refs_for_eqc(eqid as u32) {
1028                            global_txps.insert(*lt);
1029                        }
1030                    } else {
1031                        //crit!(log, "global txps = {:#?}\ncurr refs = {:#?}", global_txps, eqmap.refs_for_eqc(eqid as u32));
1032                        let txps_for_vert = eqmap.refs_for_eqc(eqid as u32);
1033                        global_txps.retain(|t| txps_for_vert.binary_search(t).is_ok());
1034                        //for lt in eqmap.refs_for_eqc(eqid as u32) {
1035                        //    global_txps.remove(lt);
1036                        //}
1037                    }
1038                }
1039                // at this point, whatever transcript ids remain in
1040                // global_txps appear in all vertices of the mcc
1041
1042                //} // unsafe
1043
1044                // project each covering transcript to its
1045                // corresponding gene, and dedup the list
1046                let mut global_genes: Vec<u32> = if gene_level_eq_map {
1047                    global_txps.iter().cloned().collect()
1048                } else {
1049                    global_txps
1050                        .iter()
1051                        .cloned()
1052                        .map(|i| tid_to_gid[i as usize])
1053                        .collect()
1054                };
1055                // sort since we will be hashing the ordered vector
1056                global_genes.sort_unstable();
1057                // dedup as well since we don't care about duplicates
1058                global_genes.dedup();
1059
1060                pug_stats.total_mccs += 1;
1061                if global_genes.len() > 1 {
1062                    pug_stats.ambiguous_mccs += 1;
1063                }
1064
1065                // assert the best covering gene in the global gene set
1066                assert!(
1067                    global_genes.contains(&best_covering_gene),
1068                    "best gene {} not in covering set, shouldn't be possible",
1069                    best_covering_gene
1070                );
1071
1072                assert!(
1073                    !global_genes.is_empty(),
1074                    "can't find representative gene(s) for a molecule"
1075                );
1076
1077                // in our hash, increment the count of this equivalence class
1078                // by 1 (and insert it if we've not seen it yet).
1079                let counter = gene_eqclass_hash.entry(global_genes).or_insert(0);
1080                *counter += 1;
1081
1082                // for every vertext that has been covered
1083                // remove it from uncovered_vertices
1084                for rv in best_mcc.iter() {
1085                    uncovered_vertices.remove(rv);
1086                }
1087            } //end-while
1088        } else {
1089            // this was a single-vertex subgraph
1090            let tv = comp_verts.first().expect("can't extract first vertex");
1091            let tl = eqmap.refs_for_eqc(g.from_index(*tv as usize).0);
1092
1093            if tl.len() == 1 {
1094                one_vertex_components[0] += 1;
1095            } else {
1096                one_vertex_components[1] += 1;
1097            }
1098
1099            let mut global_genes: Vec<u32>;
1100
1101            if gene_level_eq_map {
1102                global_genes = tl.to_vec();
1103            } else {
1104                global_genes = tl.iter().map(|i| tid_to_gid[*i as usize]).collect();
1105                global_genes.sort_unstable();
1106                global_genes.dedup();
1107            }
1108
1109            // extract gene-level eqclass and increment count by 1
1110            assert!(
1111                !global_genes.is_empty(),
1112                "can't find representative gene(s) for a molecule"
1113            );
1114
1115            pug_stats.total_mccs += 1;
1116            pug_stats.trivial_mccs += 1;
1117            if global_genes.len() > 1 {
1118                pug_stats.ambiguous_mccs += 1;
1119            }
1120            // incrementing the count of the eqclass label by 1
1121            let counter = gene_eqclass_hash.entry(global_genes).or_insert(0);
1122            *counter += 1;
1123        }
1124
1125        //let rand_cover = rand::thread_rng().choose(&tl)
1126        //    .expect("can;t get random cover");
1127        //identified_txps.push(*rand_cover as u32);
1128    }
1129
1130    /*(gene_eqclass_hash,*/
1131    pug_stats
1132    //)
1133    /*
1134    let mut salmon_eqclasses = Vec::<SalmonEQClass>::new();
1135    for (key, val) in salmon_eqclass_hash {
1136    salmon_eqclasses.push(SalmonEQClass {
1137        labels: key,
1138        counts: val,
1139    });
1140    }
1141
1142    let mut unique_evidence: Vec<bool> = vec![false; gid_map.len()];
1143    let mut no_ambiguity: Vec<bool> = vec![true; gid_map.len()];
1144    if is_only_cell {
1145    info!("Total Networks: {}", comps.len());
1146    let num_txp_unique_networks: usize = one_vertex_components.iter().sum();
1147    let num_txp_ambiguous_networks: usize = comps.len() - num_txp_unique_networks;
1148    info!(
1149        ">1 vertices Network: {}, {}%",
1150        num_txp_ambiguous_networks,
1151        num_txp_ambiguous_networks as f32 * 100.0 / comps.len() as f32
1152    );
1153    info!(
1154        "1 vertex Networks w/ 1 txp: {}, {}%",
1155        one_vertex_components[0],
1156        one_vertex_components[0] as f32 * 100.0 / comps.len() as f32
1157    );
1158    info!(
1159        "1 vertex Networks w/ >1 txp: {}, {}%",
1160        one_vertex_components[1],
1161        one_vertex_components[1] as f32 * 100.0 / comps.len() as f32
1162    );
1163
1164    //info!("Total Predicted Molecules {}", identified_txps.len());
1165
1166    // iterate and extract gene names
1167    let mut gene_names: Vec<String> = vec!["".to_string(); gid_map.len()];
1168    for (gene_name, gene_idx) in gid_map {
1169        gene_names[*gene_idx as usize] = gene_name.clone();
1170    }
1171
1172    if num_bootstraps > 0 {
1173        //entry point for bootstrapping
1174        let gene_counts: Vec<Vec<f32>> = do_bootstrapping(salmon_eqclasses,
1175        &mut unique_evidence,
1176        &mut no_ambiguity,
1177        &num_bootstraps,
1178        gid_map.len(),
1179        only_unique);
1180
1181        write_bootstraps(gene_names, gene_counts, unique_evidence,
1182        no_ambiguity, num_bootstraps);
1183        return None;
1184    }
1185    else{
1186        //entry point for EM
1187        //println!("{:?}", subsample_gene_idx);
1188        //println!("{:?}", &salmon_eqclasses);
1189        let gene_counts: Vec<f32> = optimize(salmon_eqclasses, &mut unique_evidence,
1190        &mut no_ambiguity, gid_map.len(), only_unique);
1191
1192        write_quants(gene_names, gene_counts, unique_evidence, no_ambiguity);
1193        return None;
1194    } // end-else
1195    }
1196    else {
1197    Some(optimize(salmon_eqclasses,
1198        &mut unique_evidence,
1199        &mut no_ambiguity,
1200        gid_map.len(),
1201        only_unique
1202    ))
1203    }
1204    */
1205    //identified_txps
1206}
alevin_fry/pugutils.rs

alevin_fry/
pugutils.rs