bustools 0.14.0

Interacting with the kallisto/bus format of scRNAseq data
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
//! This module handles the Equivalance class to gene mapping
//!
//! An EC represents a *set* of transcripts which the read is consistent with.
//! This is then mapped to genes:
//! * the EC resolves to a single gene, clear case for a count
//! * the EC resolves to multipel genes (the read aligned to a part of the transcriptome which is ambigous)
//!
//! It gets more complicated if we have multiple ECs for an mRNA,
//! i.e. multiple busrecords with same CB/UMI, but different EC
//! This happens since mRNAs get fragmented after amplification, and different fragments can map to different parts \
//! of the transcriptome, hence yielding different ECs
//!
//! See [Ec2GeneMapper] and [MappingResult]
//!
use crate::consistent_transcripts::{Ec2TranscriptMapper, TranscriptId, Transcriptname};
use crate::io::BusFolder;
use crate::{disjoint::Intersector, io::BusRecord};
use itertools::izip;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::hash::Hash;
use std::io::{BufReader, BufRead};

/*
just some wrappers for Strings and ints that we use for genes and equivalence classes
 */
/// Thin wrapper aroud u32, representing an Equivalence Class
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Copy, Clone, Debug)]
pub struct EC(pub u32);

/// Gene identifier
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Copy, Clone, Debug)]
pub struct GeneId(pub u32);

/// Thin wrapper around u64, a cell-barcode
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Copy, Clone, Debug)]
pub struct CB(pub u64);

/// Name of a particular gene
#[derive(Eq, PartialEq, Hash, Ord, PartialOrd, Clone, Debug)]
pub struct Genename(pub String);

/// intersecting both sets, modifying the first one (it'll stay the same or loose elements)
pub (crate) fn update_intersection_via_retain<T: Hash + Eq>(inter: &mut HashSet<T>, newset: &HashSet<T>) {
    // delete any elemt in shared_genes not present in current_set
    // i..e set intersection
    // we cant delete while iterating, so remember which elements to delete
    inter.retain(|item| newset.contains(item));
}

/// MappingResult represents an attempt to unify several BusRecords with matching CB/UMI
/// into an mRNA expressed from a single gene.
///
/// This is not always possible, as the records will have different EC
/// Sometimes the ECs are inconsistent (no overlap in gene space)
/// Sometimes the ECS are ambigous and can resolve to multipel genes
/// TODO: MappingResult isnt the best name (mapping sounds like alignment!)
///
#[derive(Debug, PartialEq)]
pub enum MappingResult {
    /// records mapped to a single Gene
    SingleGene(GeneId),
    /// record multimapped, i.e. records are consistent with multiple genes
    Multimapped(HashSet<GeneId>),
    /// records are inconsistent, pointing to different genes
    Inconsistent,
}

pub enum MappingMode {
    EC(InconsistentResolution), // just the EC as the molecular identiy of a read
    // IgnoreMultipleCbUmi,  // if we come arcoss a CB/UMI with more than one busrecord (,i.e. multiple ECs), just skip that cb/umi
    Gene(Ec2GeneMapper, InconsistentResolution),  // use the gene 
    Transcript(Ec2TranscriptMapper, InconsistentResolution),  // use the gene 
}

/// if we come across a CB/UMI the has inconsistent mapping
/// i.e. mapping to 2 different genes, how to handle
#[derive(Debug)]
pub enum InconsistentResolution {
    IgnoreInconsistent, // simply ignore the entire CB/UMI (as if we'd never seen it)
    AsDistinct, // treat as distinct entities, i.e. a chance collision of two different molecules (mRNAs)
    AsSingle,  // just ignore the fact that the moelcule might be different, aggregate all counts
}


/// For a set of busrecords (coming from the same molecule), this function tries
/// to map those records to genes consistently.
///
///  As all records come from the same mRNA,ideally we get a consistent
/// mapping (e.g. all records map to the same gene)
/// See [MappingResult] for mor explanation
pub fn find_consistent(records: &[BusRecord], ec2gene: &Ec2GeneMapper) -> MappingResult {
    /*
    set intersection in Rust is a MESS due to ownership etc!!
    Here's the strategy:
    1. copy the first set s1 entirely (the intersection will only contain elements of this set)
    2. go threought the second set s2: if an element in s2 is NOT present in s1, remove it from s1.
    */

    // this one does shrink the set, and uses the iterator

    // get the genes per BusRecord
    // let mut setlist = records.iter().map(|r| ec2gene.get(&r.EC).unwrap());
    let mut setlist = records.iter().map(|r| ec2gene.get_genes(EC(r.EC)));

    let s1 = setlist.next().unwrap();

    let mut shared_genes = s1.clone();

    // save some time if its only one record
    // actually not sure if this saves anything any more (after using MappingResult)
    // if we have one record, shared_Genes will be len==1, we pull out s1 (shared_Genes is empty now)
    // -> check if we can get rid of this block!
    if records.len() == 1 {
        if shared_genes.len() == 1 {
            let elem = *shared_genes.iter().next().unwrap();
            return MappingResult::SingleGene(elem);
        } else {
            return MappingResult::Multimapped(shared_genes);
        }
    }

    for current_set in setlist {
        // delete any elemt in shared_genes not present in current_set
        // i..e set intersection
        update_intersection_via_retain(&mut shared_genes, current_set);

        // to save some time: if the current intersection is already empty, it'll stay empty
        if shared_genes.is_empty() {
            break;
        }
    }
    match shared_genes.len() {
        0 => MappingResult::Inconsistent,
        1 => {
            let elem = *shared_genes.iter().next().unwrap();
            MappingResult::SingleGene(elem)
        }
        _ => MappingResult::Multimapped(shared_genes),
    }
}

/// Deals with the EC (equivalence class) to gene mapping
/// Resolve a given EC into a set of genes consistent with that EC
/// # Example
/// ```rust, no_run
/// # use bustools::io::BusFolder;
/// # use std::collections::HashSet;
/// # use bustools::consistent_genes::{Genename, EC};
/// let bfolder = BusFolder::new("/path/to/busfolder");
/// let ec2g = bfolder.make_mapper("/path/to/transcripts_to_genes.txt");
/// let ec = EC(1234);
/// let genenames: HashSet<Genename> = ec2g.get_genenames(ec);
/// ```
#[derive(Debug, Clone)]
pub struct Ec2GeneMapper {
    /*
    A struct to help deal with the complicated relationship between
    equivalence class (EC) and genes:
    A single EC maps to a SET of genes
    */
    ec2geneid: HashMap<EC, HashSet<GeneId>>, // maps each EC to a set of geneids (integers)

    // ec2geneid_array: Vec<HashSet<GeneId>>, // maps each EC to a set of geneids (integers)
    //
    // TODO this could be a simple Vec, indexing into it
    int_to_gene: HashMap<GeneId, Genename>, // for a given gene (ENSBEMLE id), get its unique ID (as recorded in ec2geneid)

    // for each gene get a EC that uniquely maps to that gene
    // currently removed: not really useful and convusion
    // geneid2ec: HashMap<GeneId, EC>, // consistent_ec_sets: HashSet<HashSet<EC>>  // what general EC combinations are allowed
}

// fn ec_combinatorics_for_gene(geneid: u32, ec2geneid: &HashMap<EC, HashSet<u32>>){
//     todo!();
//     // all ECs that contain the gene
//     let ecs_sets: HashMap<EC, HashSet<u32>> = ec2geneid.iter()
//         .filter(|(_ec, gset)| gset.contains(&geneid))
//         .map(|(_ec, gset)| (*_ec, gset.clone())).collect();

//     let mut current_consistent_sets :HashMap<EC, HashSet<u32>> = HashMap::new();

//     // for now, every candidate is a maybe
//     let mut current_maybe_sets :HashMap<EC, HashSet<u32>> = ecs_sets.clone();
//     let target_set = HashSet::from_iter(vec![geneid]);

//     // "recursion": just keep iterating until there's no more candidates
//     while  !current_maybe_sets.is_empty() {

//         let mut new_maybe_sets :HashMap<EC, HashSet<u32>> = HashMap::new();

//         // filter out anything thats not overlaping with target at all
//         for (ec, set) in current_maybe_sets{
//             let overlap = set.intersection(&target_set).count();
//             if overlap > 0{
//                 if set == target_set{ // the ones that are exacly the target set are added to the solution
//                     current_consistent_sets.insert(ec, set);
//                 }
//                 else{ // it contains more elements then the target set
//                     new_maybe_sets.insert(ec, set);
//                 }
//             }
//         }
//         // at this point we have a bunch of candidates in new_maybe_sets
//         // however, they are all too large and need to be combined with some other set
//         // to possibly result in the desired intersection
//         // 1. combine them with any of the current solutions: that'll be a sure

//         current_maybe_sets = new_maybe_sets;
//     }

// }
impl Ec2GeneMapper {
    /// make an Ec2GeneMapper from a HashMap of EC-> set(Genenames)
    pub fn new(ec2gene: HashMap<EC, HashSet<Genename>>) -> Self {
        // get all genes and sort them
        // could be done with a flatmap?
        // let gene_list2:HashSet<Gene> =  ec2gene
        //     .values()
        //     .flat_map(|set|
        //                 set.iter().map(|s| Gene(s.to_string()))
        //             )
        //     .collect();

        let mut gene_list: HashSet<Genename> = HashSet::new();
        for genes in ec2gene.values() {
            for g in genes {
                gene_list.insert(g.clone());
            }
        }
        let mut gene_vector: Vec<Genename> = gene_list.into_iter().collect();
        gene_vector.sort();

        // we're going to encode genes by ints hence we need
        // a mapping from gene-> int
        let gene_to_int: HashMap<Genename, GeneId> = gene_vector
            .into_iter()
            .enumerate()
            .map(|(i, g)| (g, GeneId(i as u32)))
            .collect();

        // the reverse
        let int_to_gene: HashMap<GeneId, Genename> =
            gene_to_int.iter().map(|(g, i)| (*i, g.clone())).collect();

        // mapping from EC to a set of gene_id
        // this is the main struct we'll be using!
        let mut ec2geneid: HashMap<EC, HashSet<GeneId>> = HashMap::new();
        for (ec, genes) in ec2gene.iter() {
            let geneids: HashSet<GeneId> = genes
                .iter()
                .map(|gname| *gene_to_int.get(gname).unwrap())
                .collect();
            ec2geneid.insert(*ec, geneids);
        }

        // inverse map from gene to EC
        // given a gene, get ANY EC that maps to it
        // this is all a bit ugly, so lwets remove it for now
        // let mut geneid2ec: HashMap<GeneId, EC> = HashMap::new();
        // for (ec, genes) in ec2gene.iter() {
        //     if genes.len() == 1 {
        //         let gname = genes.iter().next().unwrap().clone();
        //         let gid = *gene_to_int.get(&gname).unwrap();
        //         geneid2ec.insert(gid, *ec);
        //     }
        // }

        // //faster
        // // need to mark ec2gene ordered first
        // let mut ordered_ec2gene: BTreeMap<EC, HashSet<Genename>> = BTreeMap::new();
        // for (k,v) in ec2gene {
        //     ordered_ec2gene.insert(k ,v);
        // }
        // let mut _vec = Vec::with_capacity(ordered_ec2gene.len());
        // for (ec, genes) in ordered_ec2gene {
        //     let geneids: HashSet<GeneId> = genes
        //         .iter()
        //         .map(|gname| *gene_to_int.get(gname).unwrap())
        //         .collect();
        //     _vec.push(geneids);
        // }

        Ec2GeneMapper { ec2geneid, int_to_gene }
    }

    /// resolves an EC into a set of gene_ids
    pub fn get_genes(&self, ec: EC) -> &HashSet<GeneId> {
        self.ec2geneid.get(&ec).unwrap()
        // self.ec2geneid_array.get(ec.0 as usize).unwrap()
    }

    /// returns a set of genenames (usually ENSG) consistent with EC
    pub fn get_genenames(&self, ec: EC) -> HashSet<Genename> {
        // resolves an EC into a set of gene_names
        let geneids = self.get_genes(ec);
        let genenames = geneids
            .iter()
            .map(|gid| self.resolve_gene_id(*gid))
            .collect();
        genenames
    }

    /// resolves a gene_id into its genename (ENSEMBL)
    pub fn resolve_gene_id(&self, gene_id: GeneId) -> Genename {
        let r = self.int_to_gene.get(&gene_id).unwrap();
        r.clone()
    }

    /// returns a list of all genes in the mapping, sorted by int-id
    pub fn get_gene_list(&self) -> Vec<Genename> {
        let ngenes = self.int_to_gene.len();
        let genelist_vector: Vec<Genename> = (0..ngenes)
            .map(|k| self.resolve_gene_id(GeneId(k as u32)))
            .collect();
        genelist_vector
    }

    // fn resolve_geneid_to_ec_uniquely(&self, geneid: u32) -> Option<EC> {
    //     // just dereferencing whats inside the Option returned by the dict
    //     self.geneid2ec.get(&GeneId(geneid)).copied()
    // }
}

pub (crate) fn make_mapper(busfolder: &BusFolder, t2g_file: &str) -> Ec2GeneMapper{
    let t2g_dict = parse_t2g(t2g_file);
    let e2g = build_ec2gene(
        &busfolder.parse_ecmatrix(), 
        &busfolder.parse_transcript(),
        &t2g_dict);
    Ec2GeneMapper::new(e2g)
}


fn build_ec2gene(
    ec_dict: &HashMap<EC, Vec<TranscriptId>>,
    transcript_dict: &HashMap<TranscriptId, Transcriptname>,
    t2g_dict: &HashMap<Transcriptname, Genename>,
) -> HashMap<EC, HashSet<Genename>> {
    let mut ec2gene: HashMap<EC, HashSet<Genename>> = HashMap::new();

    for (ec, transcript_ints) in ec_dict.iter() {
        let mut genes: HashSet<Genename> = HashSet::new();

        for t_int in transcript_ints {
            let t_name = transcript_dict.get(t_int).unwrap();

            // if we can resolve, put the genename, otherwsise use the transcript name instead
            //
            // actually, turns out that kallisto/bustools treats it differently:
            // if the transcript doenst resolve, just drop tha trasncrip from the EC set
            // TODO what happens if non of the EC transcripts resolve
            if let Some(genename) = t2g_dict.get(t_name) {
                genes.insert(genename.clone());
            }
            // else { genes.insert(Genename(t_name.clone())); }
        }
        ec2gene.insert(*ec, genes);
    }
    // // sanity check, make sure no Ec set is empty
    // for (ec, gset) in ec2gene.iter(){
    //     if gset.is_empty(){
    //         println!("{ec:?}'s geneset is empty");
    //     }
    // }

    ec2gene
}

fn parse_t2g(t2g_file: &str) -> HashMap<Transcriptname, Genename> {
    let mut t2g_dict: HashMap<Transcriptname, Genename> = HashMap::new();
    let file = File::open(t2g_file).unwrap_or_else(|_| panic!("{} not found", t2g_file));
    let reader = BufReader::new(file);
    for line in reader.lines() {
        if let Ok(l) = line {
            let mut s = l.split_whitespace();
            let transcript_id = s.next().unwrap();
            let ensemble_id = s.next().unwrap();
            let _symbol = s.next().unwrap();

            let tname = Transcriptname(transcript_id.to_string());

            assert!(!t2g_dict.contains_key(&tname)); //make sure transcripts dont map to multiple genes
            t2g_dict.insert(tname, Genename(ensemble_id.to_string()));
        } else {
            panic!("Error readin lines from {}", t2g_file);
        }
    }
    t2g_dict
}



/// Represents a busrecord (actually an observed CB/UIM combination)
/// with a set of consistent genes (genes that this CB/UMI could map to)
/// and the number of times this combination was seen in the busfile
///
#[derive(Debug)]
#[allow(non_snake_case)]
pub struct CUGset {
    pub CB: u64,                    //8byte
    pub UMI: u64,                   // 8byte
    pub GENESET: HashSet<Genename>, // 4v byte
    pub COUNT: u32,                 // 4v byte
}

/// Group a set of busrecords (same CB/UMI) by genes they are consistent with
pub fn groubygene(records: Vec<BusRecord>, ec2gene: &Ec2GeneMapper) -> Vec<CUGset> {
    let mut emissions: Vec<CUGset> = Vec::with_capacity(records.len()); //with capacity worst case scenario

    // aggregate by overlpping genes
    let mut inter: Intersector<Genename, BusRecord> = Intersector::new();
    for r in records {
        let gset = ec2gene.get_genenames(EC(r.EC));
        inter.add(gset, r);
    }

    // for (gset, grouped_records) in inter.iterate_items(){
    for (gset, grouped_records) in izip!(inter.keys, inter.items) {
        let counts: u32 = grouped_records.iter().map(|x| x.COUNT).sum();
        let r1 = grouped_records.first().unwrap();

        let new_record = CUGset { CB: r1.CB, UMI: r1.UMI, GENESET: gset, COUNT: counts };
        emissions.push(new_record);
    }
    emissions
}

#[cfg(test)]
mod testing {
    use crate::{
        consistent_genes::{find_consistent, groubygene, Genename, MappingResult},
        io::BusRecord,
        utils::vec2set,
    };
    use std::collections::{HashMap, HashSet};

    use super::{Ec2GeneMapper, EC};

    #[test]
    fn test_consistent() {
        let ec0 = vec2set(vec![Genename("A".to_string())]);
        let ec1 = vec2set(vec![Genename("B".to_string())]);
        let ec2 = vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
        let ec3 = vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);

        let ec_dict: HashMap<EC, HashSet<Genename>> =
            HashMap::from([(EC(0), ec0), (EC(1), ec1), (EC(2), ec2), (EC(3), ec3)]);

        let es_mapper = Ec2GeneMapper::new(ec_dict);

        // not find_consistent doesnt check the records indeed come from the same CB/UMI

        // single read, consistent with A
        let r1 = BusRecord { CB: 0, UMI: 21, EC: 0, COUNT: 2, FLAG: 0 };
        let res1: MappingResult = find_consistent(&vec![r1], &es_mapper);
        match res1 {
            MappingResult::SingleGene(g) => {
                assert_eq!(Genename("A".to_string()), es_mapper.resolve_gene_id(g))
            }
            MappingResult::Multimapped(_) | MappingResult::Inconsistent => panic!(),
        }

        // single read, consistent with A and B, hence multimapped
        let r2 = BusRecord { CB: 0, UMI: 21, EC: 2, COUNT: 2, FLAG: 0 };
        let res2: MappingResult = find_consistent(&vec![r2], &es_mapper);
        println!("{:?}", res2);
        match res2 {
            MappingResult::SingleGene(_) | MappingResult::Inconsistent => panic!(),
            MappingResult::Multimapped(gs) => {
                let obs = gs
                    .iter()
                    .map(|g| es_mapper.resolve_gene_id(*g))
                    .collect::<HashSet<_>>();
                assert_eq!(
                    obs,
                    vec2set(vec![Genename("A".to_string()), Genename("B".to_string())])
                )
            }
        }

        // two reads, consistent with A
        let r3 = BusRecord { CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0 };
        let r4 = BusRecord { CB: 3, UMI: 0, EC: 2, COUNT: 2, FLAG: 0 };
        let res3: MappingResult = find_consistent(&vec![r3, r4], &es_mapper);
        match res3 {
            MappingResult::SingleGene(g) => {
                assert_eq!(Genename("A".to_string()), es_mapper.resolve_gene_id(g))
            }
            MappingResult::Multimapped(_) | MappingResult::Inconsistent => panic!(),
        }

        // // two reads, inconsistent with A, B
        let r5 = BusRecord { CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0 };
        let r6 = BusRecord { CB: 3, UMI: 0, EC: 1, COUNT: 2, FLAG: 0 };
        let res4 = find_consistent(&vec![r5, r6], &es_mapper);
        assert_eq!(res4, MappingResult::Inconsistent);

        // // three reads,  A, B, (A,B)
        // // inconsintent at the end
        let r7 = BusRecord { CB: 1, UMI: 3, EC: 0, COUNT: 2, FLAG: 0 };
        let r8 = BusRecord { CB: 3, UMI: 0, EC: 1, COUNT: 2, FLAG: 0 };
        let r9 = BusRecord { CB: 3, UMI: 0, EC: 2, COUNT: 2, FLAG: 0 };
        let res5 = find_consistent(&vec![r7, r8, r9], &es_mapper);
        assert_eq!(res5, MappingResult::Inconsistent)
    }

    #[test]
    fn test_ec_struct() {
        let ec0 = vec2set(vec![Genename("A".to_string())]);
        let ec1 = vec2set(vec![Genename("B".to_string())]);
        let ec2 = vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
        let ec3 = vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);

        let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
            (EC(0), ec0.clone()),
            (EC(1), ec1.clone()),
            (EC(2), ec2.clone()),
            (EC(3), ec3.clone()),
        ]);

        let es = Ec2GeneMapper::new(ec_dict);

        for (i, ec_set) in vec![ec0, ec1, ec2, ec3].iter().enumerate() {
            let gids = es.get_genes(EC(i as u32));
            let gnames: HashSet<_> = gids.iter().map(|i| es.resolve_gene_id(*i)).collect();
            assert_eq!(&gnames, ec_set);
        }
    }
    #[test]
    fn test_get_gene_list() {
        let ec0 = vec2set(vec![Genename("A".to_string())]);
        let ec1 = vec2set(vec![Genename("B".to_string())]);
        let ec2 = vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
        let ec3 = vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);

        let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
            (EC(0), ec0.clone()),
            (EC(1), ec1.clone()),
            (EC(2), ec2.clone()),
            (EC(3), ec3.clone()),
        ]);

        let es = Ec2GeneMapper::new(ec_dict);
        assert_eq!(
            es.get_gene_list(),
            vec![
                Genename("A".to_string()),
                Genename("B".to_string()),
                Genename("C".to_string()),
                Genename("D".to_string())
            ]
        )
    }

    // #[test]
    // fn test_resolve_geneid_2_ec() {
    //     let ec0 = vec2set(vec![Genename("A".to_string())]);
    //     let ec1 = vec2set(vec![Genename("B".to_string())]);
    //     let ec2 = vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
    //     let ec3 = vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);

    //     let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
    //         (EC(0), ec0.clone()),
    //         (EC(1), ec1.clone()),
    //         (EC(2), ec2.clone()),
    //         (EC(3), ec3.clone()),
    //     ]);
    //     let es = Ec2GeneMapper::new(ec_dict);

    //     // Gene A resolves to EC0
    //     assert_eq!(es.resolve_geneid_to_ec_uniquely(0), Some(EC(0)));
    //     // Gene Bresolves to EC1
    //     assert_eq!(es.resolve_geneid_to_ec_uniquely(1), Some(EC(1)));

    //     // Gene C cant be resolved uniquely (EC3 mapps to both C,D)
    //     assert_eq!(es.resolve_geneid_to_ec_uniquely(2), None);
    //     // Gene D cant be resolved uniquely (EC3 mapps to both C,D)
    //     assert_eq!(es.resolve_geneid_to_ec_uniquely(3), None);
    // }

    #[test]
    fn test_groubygene() {
        let ec0: HashSet<Genename> = vec2set(vec![Genename("A".to_string())]);
        let ec1: HashSet<Genename> = vec2set(vec![Genename("B".to_string())]);
        let ec2: HashSet<Genename> =
            vec2set(vec![Genename("A".to_string()), Genename("B".to_string())]);
        let ec3: HashSet<Genename> =
            vec2set(vec![Genename("C".to_string()), Genename("D".to_string())]);

        let ec_dict: HashMap<EC, HashSet<Genename>> = HashMap::from([
            (EC(0), ec0.clone()),
            (EC(1), ec1.clone()),
            (EC(2), ec2.clone()),
            (EC(3), ec3.clone()),
        ]);

        let es = Ec2GeneMapper::new(ec_dict);

        // first sample: two records, with consistent gene A
        let r1 = BusRecord { CB: 0, UMI: 1, EC: 0, COUNT: 2, FLAG: 0 };
        let r2 = BusRecord { CB: 0, UMI: 1, EC: 2, COUNT: 2, FLAG: 0 };

        let r = groubygene(vec![r1, r2], &es);
        assert_eq!(r.len(), 1);
        let r1 = r.first().unwrap();
        assert_eq!(r1.COUNT, 4);
        assert_eq!(
            r1.GENESET,
            vec![Genename("A".to_string())]
                .into_iter()
                .collect::<HashSet<Genename>>()
        );

        // second sample: two records, with consistent gene A the other consistent with gene B
        let s1 = BusRecord { CB: 0, UMI: 1, EC: 0, COUNT: 3, FLAG: 0 }; // A
        let s2 = BusRecord { CB: 0, UMI: 1, EC: 1, COUNT: 4, FLAG: 0 }; //B
        let r = groubygene(vec![s1, s2], &es);
        assert_eq!(r.len(), 2);
        let r1 = &r[0];
        let r2 = &r[1];
        assert_eq!(r1.COUNT, 3);
        assert_eq!(r2.COUNT, 4);
    }

    // #[test]
    // #[allow(dead_code)]
    // fn test_ec() {
    //     let folder = "/home/michi/mounts/TB4drive/ISB_data/201015_NS500720_0063_AHV53GBGXG/kallisto_quant/01_Day2/kallisto/sort_bus/bus_output/";
    //     let t2g_file = "/home/michi/mounts/TB4drive/kallisto_resources/transcripts_to_genes.txt";
    //     let b = BusFolder::new(folder);
    //     let ecmapper = b.make_mapper(t2g_file);

    //     for g in 0..ecmapper.int_to_gene.len() {
    //         ecmapper.resolve_geneid_to_ec_uniquely(g as u32).unwrap();
    //         // let ec = ecmapper.geneid2ec.get(&(g as u32)).unwrap().0;
    //     }
    // }
}