alevin_fry/
utils.rs

1/*
2 * Copyright (c) 2020-2024 COMBINE-lab.
3 *
4 * This file is part of alevin-fry
5 * (see https://www.github.com/COMBINE-lab/alevin-fry).
6 *
7 * License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
8 */
9use crate::constants as afconst;
10use crate::eq_class::IndexedEqList;
11use anyhow::{anyhow, Context};
12use bstr::io::BufReadExt;
13use core::fmt;
14use dashmap::DashMap;
15use libradicl::utils::SPLICE_MASK_U32;
16use needletail::bitkmer::*;
17use std::collections::{HashMap, HashSet};
18use std::error::Error;
19use std::fs::File;
20use std::io::{BufReader, BufWriter, Write};
21use std::path::Path;
22use std::path::PathBuf;
23use std::str::FromStr;
24use thiserror::Error;
25
26/*
27struct QuantArguments {
28    num_threads: u64,
29    num_bootstraps: u64,
30    init_uniform: bool,
31    summary_stat: bool,
32    dump_eq: bool,
33    use_mtx: bool,
34    input_dir: String,
35    output_dir: String,
36    tg_map: String,
37    resolution: ResolutionStrategy,
38    sa_model: SplicedAmbiguityModel,
39    small_thresh: u64,
40    filter_list: String
41}
42*/
43
44pub(crate) fn remove_file_if_exists(fname: &Path) -> anyhow::Result<()> {
45    if fname.exists() {
46        std::fs::remove_file(fname)
47            .with_context(|| format!("could not remove {}", fname.display()))?;
48    }
49    Ok(())
50}
51
52/// FROM https://github.com/10XGenomics/rust-debruijn/blob/master/src/dna_string.rs
53/// count Hamming distance between 2 2-bit DNA packed u64s
54pub(super) fn count_diff_2_bit_packed(a: u64, b: u64) -> usize {
55    let bit_diffs = a ^ b;
56    let two_bit_diffs = (bit_diffs | bit_diffs >> 1) & 0x5555555555555555;
57    two_bit_diffs.count_ones() as usize
58}
59
60#[inline(always)]
61fn unspliced_of(gid: u32) -> u32 {
62    gid + 1
63}
64
65/// should always compile to no-op
66#[inline(always)]
67fn spliced_of(gid: u32) -> u32 {
68    gid
69}
70
71// given a spliced or unspliced gene id, return
72// the spliced (canonical) id for this gene.
73#[inline(always)]
74fn spliced_id(gid: u32) -> u32 {
75    gid & SPLICE_MASK_U32
76}
77
78#[inline(always)]
79pub fn same_gene(g1: u32, g2: u32, with_unspliced: bool) -> bool {
80    (g1 == g2) || (with_unspliced && (spliced_id(g1) == spliced_id(g2)))
81}
82
83#[inline(always)]
84pub fn is_spliced(gid: u32) -> bool {
85    // if the id is even, then it's spliced
86    (0x1 & gid) == 0
87}
88
89#[inline(always)]
90pub fn is_unspliced(gid: u32) -> bool {
91    // if it's not spliced, then it is unspliced
92    !is_spliced(gid)
93}
94
95/// Write the permit_freq.bin and all_freq.bin files
96pub fn write_permit_list_freq(
97    o_path: &std::path::Path,
98    bclen: u16,
99    permit_freq_map: &HashMap<u64, u64, ahash::RandomState>,
100) -> Result<(), Box<dyn std::error::Error>> {
101    let output = std::fs::File::create(o_path)?;
102    let mut writer = BufWriter::new(&output);
103
104    {
105        // the first u64 represents file format version.
106        writer
107            .write_all(&afconst::PERMIT_FILE_VER.to_le_bytes())
108            .unwrap();
109
110        // the second u64 represents barcode length
111        writer.write_all(&(u64::from(bclen)).to_le_bytes()).unwrap();
112
113        // the rest records the permitted barcode:freq hashmap
114        bincode::serialize_into(&mut writer, &permit_freq_map)?;
115    }
116    Ok(())
117}
118
119/// Write the permit_freq.bin and all_freq.bin files
120pub fn write_permit_list_freq_dashmap(
121    o_path: &std::path::Path,
122    bclen: u16,
123    permit_freq_map: &DashMap<u64, u64, ahash::RandomState>,
124) -> Result<(), Box<dyn std::error::Error>> {
125    let output = std::fs::File::create(o_path)?;
126    let mut writer = BufWriter::new(&output);
127
128    {
129        // the first u64 represents file format version.
130        writer
131            .write_all(&afconst::PERMIT_FILE_VER.to_le_bytes())
132            .unwrap();
133
134        // the second u64 represents barcode length
135        writer.write_all(&(u64::from(bclen)).to_le_bytes()).unwrap();
136
137        // the rest records the permitted barcode:freq hashmap
138        bincode::serialize_into(&mut writer, &permit_freq_map)?;
139    }
140    Ok(())
141}
142
143/// Parse a 3 column tsv of the format
144/// transcript_name gene_name   status
145/// where status is one of S or U each gene will be allocated both a spliced and
146/// unspliced variant, the spliced index will always be even and the unspliced odd,
147/// and they will always be adjacent ids.  For example, if gene A is present in
148/// the sample and it's spliced variant is assigned id i,  then it will always be true that
149/// i % 2 == 0
150/// and
151/// (i+1) will be the id for the unspliced version of gene A
152fn parse_tg_spliced_unspliced(
153    rdr: &mut csv::Reader<File>,
154    ref_count: usize,
155    rname_to_id: &HashMap<String, u32, ahash::RandomState>,
156    gene_names: &mut Vec<String>,
157    gene_name_to_id: &mut HashMap<String, u32, ahash::RandomState>,
158) -> anyhow::Result<(Vec<u32>, bool)> {
159    // map each transcript id to the corresponding gene id
160    // the transcript name can be looked up from the id in the RAD header,
161    // and the gene name can be looked up from the id in the gene_names
162    // vector.
163    let mut tid_to_gid = vec![u32::MAX; ref_count];
164
165    // Record will be transcript, gene, splicing status
166    type TsvRec = (String, String, String);
167
168    // the transcripts for which we've found a gene mapping
169    let mut found = 0usize;
170
171    // starting from 0, we assign each gene 2 ids (2 consecutive integers),
172    // the even ids are for spliced txps, the odd ids are for unspliced txps
173    // for convenience, we define a gid helper, next_gid
174    let mut next_gid = 0u32;
175    // apparently the "header" (first row) will be included
176    // in the iterator returned by `deserialize` anyway
177    /*let hdr = rdr.headers()?;
178    let hdr_vec : Vec<Result<TsvRec,csv::Error>> = vec![hdr.deserialize(None)];
179    */
180    for result in rdr.deserialize() {
181        let record: TsvRec = result?;
182        // first, get the first id for this gene
183        let gene_id = *gene_name_to_id.entry(record.1.clone()).or_insert_with(|| {
184            // as we need to return the current next_gid if we run this code
185            // we add by two and then return current gene id.
186            let cur_gid = next_gid;
187            next_gid += 2;
188            // we haven't added this gene name already,
189            // we append it now to the list of gene names.
190            gene_names.push(record.1.clone());
191            cur_gid
192        });
193
194        // get the transcript id
195        if let Some(transcript_id) = rname_to_id.get(&record.0) {
196            found += 1;
197            if record.2.eq_ignore_ascii_case("U") {
198                // This is an unspliced txp
199                // we link it to the second gid of this gene
200                tid_to_gid[*transcript_id as usize] = unspliced_of(gene_id);
201            } else if record.2.eq_ignore_ascii_case("S") {
202                // This is a spliced txp, we link it to the
203                // first gid of this gene
204                tid_to_gid[*transcript_id as usize] = spliced_of(gene_id);
205            } else {
206                return Err(anyhow!(
207                    "Third column in 3 column txp-to-gene file must be S or U"
208                ));
209            }
210        }
211    }
212
213    assert_eq!(
214        found, ref_count,
215        "The tg-map must contain a gene mapping for all transcripts in the header"
216    );
217
218    Ok((tid_to_gid, true))
219}
220
221fn parse_tg_spliced(
222    rdr: &mut csv::Reader<File>,
223    ref_count: usize,
224    rname_to_id: &HashMap<String, u32, ahash::RandomState>,
225    gene_names: &mut Vec<String>,
226    gene_name_to_id: &mut HashMap<String, u32, ahash::RandomState>,
227) -> anyhow::Result<(Vec<u32>, bool)> {
228    // map each transcript id to the corresponding gene id
229    // the transcript name can be looked up from the id in the RAD header,
230    // and the gene name can be looked up from the id in the gene_names
231    // vector.
232    let mut tid_to_gid = vec![u32::MAX; ref_count];
233    // now read in the transcript to gene map
234    type TsvRec = (String, String);
235    // now, map each transcript index to it's corresponding gene index
236    let mut found = 0usize;
237    // apparently the "header" (first row) will be included
238    // in the iterator returned by `deserialize` anyway
239    /*let hdr = rdr.headers()?;
240    let hdr_vec : Vec<Result<TsvRec,csv::Error>> = vec![hdr.deserialize(None)];
241    */
242    for result in rdr.deserialize() {
243        match result {
244            Ok(record_in) => {
245                let record: TsvRec = record_in;
246                //let record: TSVRec = result?;
247                // first, get the id for this gene
248                let next_id = gene_name_to_id.len() as u32;
249                let gene_id = *gene_name_to_id.entry(record.1.clone()).or_insert(next_id);
250                // if we haven't added this gene name already, then
251                // append it now to the list of gene names.
252                if gene_id == next_id {
253                    gene_names.push(record.1.clone());
254                }
255                // get the transcript id
256                if let Some(transcript_id) = rname_to_id.get(&record.0) {
257                    found += 1;
258                    tid_to_gid[*transcript_id as usize] = gene_id;
259                }
260            }
261            Err(e) => {
262                /*
263                crit!(
264                log,
265                "Encountered error [{}] when reading the transcript-to-gene map. Please make sure the transcript-to-gene mapping is a 2 column, tab separated file.",
266                e
267                );
268                */
269                return Err(anyhow!(
270                    "failed to parse the transcript-to-gene map : {}.",
271                    e
272                ));
273            }
274        }
275    }
276
277    assert_eq!(
278        found, ref_count,
279        "The tg-map must contain a gene mapping for all transcripts in the header"
280    );
281
282    Ok((tid_to_gid, false))
283}
284
285pub fn parse_tg_map(
286    tg_map: &PathBuf,
287    ref_count: usize,
288    rname_to_id: &HashMap<String, u32, ahash::RandomState>,
289    gene_names: &mut Vec<String>,
290    gene_name_to_id: &mut HashMap<String, u32, ahash::RandomState>,
291) -> anyhow::Result<(Vec<u32>, bool)> {
292    let t2g_file = std::fs::File::open(tg_map).context("couldn't open file")?;
293    let mut rdr = csv::ReaderBuilder::new()
294        .has_headers(false)
295        .delimiter(b'\t')
296        .from_reader(t2g_file);
297
298    let headers = rdr.headers()?;
299    match headers.len() {
300        2 => {
301            // parse the 2 column format
302            parse_tg_spliced(
303                &mut rdr,
304                ref_count,
305                rname_to_id,
306                gene_names,
307                gene_name_to_id,
308            )
309        }
310        3 => {
311            // parse the 3 column format
312            parse_tg_spliced_unspliced(
313                &mut rdr,
314                ref_count,
315                rname_to_id,
316                gene_names,
317                gene_name_to_id,
318            )
319        }
320        _ => {
321            // not supported
322            Err(anyhow!(
323                "Transcript-gene mapping must have either 2 or 3 columns."
324            ))
325        }
326    }
327}
328
329/// Extracts UMI counts from the `gene_eqc` HashMap.
330/// This function is to be used when we are counting UMIs in
331/// USA mode, and when we do not wish to consider gene-ambiguous
332/// reads.
333/// UMIs will be assigned to the spliced, unspliced, or ambiguous
334/// version of their gene.  If a UMI is compatible with more than
335/// one gene, but only one *spliced* gene, then it is assigned to
336/// the spliced gene, unless there is too much multimapping
337/// (i.e. it is compatible with > 10 different loci).
338pub fn extract_counts(
339    gene_eqc: &HashMap<Vec<u32>, u32, ahash::RandomState>,
340    num_counts: usize,
341) -> Vec<f32> {
342    // the number of genes not considering status
343    // i.e. spliced, unspliced, ambiguous
344    let unspliced_offset = num_counts / 3;
345    let ambig_offset = 2 * unspliced_offset;
346    let mut counts = vec![0_f32; num_counts];
347
348    for (labels, count) in gene_eqc {
349        // the length of the label will tell us if this is a
350        // splicing-unique, gene-unique (but splicing ambiguous).
351        // or gene-ambiguous equivalence class label.
352        match labels.len() {
353            1 => {
354                // determine if spliced or unspliced
355                if let Some(gid) = labels.first() {
356                    let idx = if is_spliced(*gid) {
357                        (*gid >> 1) as usize
358                    } else {
359                        unspliced_offset + (*gid >> 1) as usize
360                    };
361                    counts[idx] += *count as f32;
362                }
363            }
364            2 => {
365                // spliced & unspliced of the same gene, or something differnet?
366                if let (Some(g1), Some(g2)) = (labels.first(), labels.last()) {
367                    if same_gene(*g1, *g2, true) {
368                        let idx = ambig_offset + (*g1 >> 1) as usize;
369                        //eprintln!("ambig count {} at {}!", *count, idx);
370                        counts[idx] += *count as f32;
371                    } else {
372                        // report spliced if we can
373                        match (is_spliced(*g1), is_spliced(*g2)) {
374                            (true, false) => {
375                                counts[(*g1 >> 1) as usize] += *count as f32;
376                            }
377                            (false, true) => {
378                                counts[(*g2 >> 1) as usize] += *count as f32;
379                            }
380                            _ => { /* do nothing */ }
381                        }
382                    }
383                }
384            }
385            3..=10 => {
386                // if we don't have *too* many distinct genes matching this UMI
387                // then apply the prefer-spliced rule.
388
389                // See if there is precisely 1 spliced gene, and if so take it
390                // but assign the read as ambiguous if it is for this gene
391                let mut iter = labels.iter();
392                // search for the first spliced index
393                if let Some(sidx) = iter.position(|&x| is_spliced(x)) {
394                    // if we found a spliced gene, check if there are any more
395                    if let Some(_sidx2) = iter.position(|&x| is_spliced(x)) {
396                        // in this case we had 2 spliced genes, so this is
397                        // gene ambiguous and we just drop it.
398                    } else {
399                        // we only had one spliced gene.  Check to see if the
400                        // index following the spliced gene we found is its
401                        // unspliced variant or not.  If so, add it as ambiguous
402                        // otherwise, add it as spliced
403                        if let Some(sg) = labels.get(sidx) {
404                            if let Some(ng) = labels.get(sidx + 1) {
405                                if same_gene(*sg, *ng, true) {
406                                    let idx = ambig_offset + (*sg >> 1) as usize;
407                                    counts[idx] += *count as f32;
408                                    continue;
409                                }
410                            }
411                            counts[(*sg >> 1) as usize] += *count as f32;
412                        }
413                    }
414                }
415            }
416            _ => {}
417        }
418    }
419    counts
420}
421
422/// Extracts UMI counts from the `gene_eqc` HashMap.
423/// This function is to be used when we are counting UMIs in
424/// USA mode.  Multimappers will be uniformly allocated to the
425/// genes to which they map.
426pub fn extract_counts_mm_uniform(
427    gene_eqc: &HashMap<Vec<u32>, u32, ahash::RandomState>,
428    num_counts: usize,
429) -> Vec<f32> {
430    // the number of genes not considering status
431    // i.e. spliced, unspliced, ambiguous
432    let unspliced_offset = num_counts / 3;
433    let ambig_offset = 2 * unspliced_offset;
434    let mut counts = vec![0_f32; num_counts];
435    let mut tvec = Vec::<usize>::with_capacity(16);
436
437    for (labels, count) in gene_eqc {
438        // the length of the label will tell us if this is a
439        // splicing-unique, gene-unique (but splicing ambiguous).
440        // or gene-ambiguous equivalence class label.
441        match labels.len() {
442            1 => {
443                // determine if spliced or unspliced
444                if let Some(gid) = labels.first() {
445                    let idx = if is_spliced(*gid) {
446                        (*gid >> 1) as usize
447                    } else {
448                        unspliced_offset + (*gid >> 1) as usize
449                    };
450                    counts[idx] += *count as f32;
451                }
452            }
453            _ => {
454                // iterate over all of the genes
455                let mut iter = labels.iter().peekable();
456                tvec.clear();
457                while let Some(gn) = iter.next() {
458                    // the base index of this gene
459                    let mut idx = (gn >> 1) as usize;
460                    // if the current gene is spliced
461                    // check if the next item is the unspliced version
462                    // of this gene.
463                    if is_spliced(*gn) {
464                        if let Some(ng) = iter.peek() {
465                            // if this is the unspliced version
466                            // of the same gene, then the count allocation
467                            // goes to the ambiguous label
468                            if same_gene(*gn, **ng, true) {
469                                idx += ambig_offset;
470                                // advance the iterator so we don't see
471                                // this again.
472                                iter.next();
473                            }
474                            // if it's not the same gene then add the
475                            // contribution to the spliced molecule
476                            // so do nothing here
477                        }
478                    } else {
479                        // this is unspliced, so even if there is a next element
480                        // it cannot belong to the same gene.
481                        // modify the index so the contribution is
482                        // to the unspliced gene index.
483                        idx += unspliced_offset;
484                    }
485                    tvec.push(idx)
486                }
487                let fcount = (*count as f32) / (tvec.len() as f32);
488                for g in &tvec {
489                    counts[*g] += fcount;
490                }
491            }
492        }
493    }
494    counts
495}
496
497/// Extracts an `IndexedEqList` and equivalence class ID / count
498/// vector from the `gene_eqc` HashMap.
499/// This function is used in USA-mode when we wish to resolve
500/// multi-mapping UMIs via an EM algorithm. Equivalence class
501/// labels (stored in `idx_eq_list`) will contain
502/// spliced, unspliced and ambiguous gene IDs based on UMI mappings,
503/// and `eq_id_count` will enumerate the count of UMIs for each
504/// observed equivalence class.
505pub fn extract_usa_eqmap(
506    gene_eqc: &HashMap<Vec<u32>, u32, ahash::RandomState>,
507    num_counts: usize,
508    idx_eq_list: &mut IndexedEqList,
509    eq_id_count: &mut Vec<(u32, u32)>,
510) {
511    // We use a little trick here.  Even though the resulting
512    // USA-mode equivalence classes will not be over the same set
513    // of gene IDs as the input list, we *do* know there will be
514    // a 1-1 correspondence, such that each equivalence class label
515    // in `gene_eqc` will produce exactly one USA-mode equivalence
516    // class label, and that each USA-mode equivalence class label
517    // will be unique.  This means we can just clear out our
518    // `idx_eq_list` and add the new class labels and counts as we
519    // encounter them.
520    idx_eq_list.clear();
521    eq_id_count.clear();
522
523    // i.e. spliced, unspliced, ambiguous
524    let unspliced_offset = num_counts / 3;
525    let ambig_offset = 2 * unspliced_offset;
526    let mut tvec = Vec::<u32>::with_capacity(16);
527
528    for (ctr, (labels, count)) in gene_eqc.iter().enumerate() {
529        // the length of the label will tell us if this is a
530        // splicing-unique, gene-unique (but splicing ambiguous).
531        // or gene-ambiguous equivalence class label.
532        match labels.len() {
533            1 => {
534                // determine if spliced or unspliced
535                if let Some(gid) = labels.first() {
536                    let idx = if is_spliced(*gid) {
537                        (*gid >> 1) as usize
538                    } else {
539                        unspliced_offset + (*gid >> 1) as usize
540                    };
541                    idx_eq_list.add_single_label(idx as u32);
542                    eq_id_count.push((ctr as u32, *count));
543                }
544            }
545            _ => {
546                // iterate over all of the genes
547                let mut iter = labels.iter().peekable();
548                tvec.clear();
549                while let Some(gn) = iter.next() {
550                    // the base index of this gene
551                    let mut idx = (gn >> 1) as usize;
552                    // if the current gene is spliced
553                    // check if the next item is the unspliced version
554                    // of this gene.
555                    if is_spliced(*gn) {
556                        if let Some(ng) = iter.peek() {
557                            // if this is the unspliced version
558                            // of the same gene, then the count allocation
559                            // goes to the ambiguous label
560                            if same_gene(*gn, **ng, true) {
561                                idx += ambig_offset;
562                                // advance the iterator so we don't see
563                                // this again.
564                                iter.next();
565                            }
566                            // if it's not the same gene then add the
567                            // contribution to the spliced molecule
568                            // so do nothing here
569                        }
570                    } else {
571                        // this is unspliced, so even if there is a next element
572                        // it cannot belong to the same gene.
573                        // modify the index so the contribution is
574                        // to the unspliced gene index.
575                        idx += unspliced_offset;
576                    }
577                    tvec.push(idx as u32);
578                }
579                // NOTE: the tvec won't necessarily be in sorted order
580                // however, because we know the original eqc labels
581                // and the USA mode labels are 1-1, we don't need this
582                // so avoid the sort here.
583                idx_eq_list.add_label_vec(tvec.as_slice());
584                eq_id_count.push((ctr as u32, *count));
585            }
586        }
587    }
588}
589
590pub fn get_bit_mask(nt_index: usize, fill_with: u64) -> u64 {
591    let mut mask: u64 = fill_with;
592    mask <<= 2 * (nt_index - 1);
593    mask
594}
595
596pub fn get_all_snps(bc: u64, bc_length: usize) -> Vec<u64> {
597    assert!(
598        bc <= 2u64.pow(2 * bc_length as u32),
599        "the barcode id is larger than possible (based on barcode length)"
600    );
601    assert!(
602        bc_length <= 32,
603        "barcode length greater than 32 not supported"
604    );
605
606    let mut snps: Vec<u64> = Vec::with_capacity(3 * bc_length);
607
608    for nt_index in 1..=bc_length {
609        // clearing the two relevant bits based on nucleotide position
610        let bit_mask = bc & !get_bit_mask(nt_index, 3);
611
612        // iterating over all 4 choices of the nucleotide
613        for i in 0..=3 {
614            let new_bc = bit_mask | get_bit_mask(nt_index, i);
615            if new_bc != bc {
616                snps.push(new_bc);
617            }
618        }
619    }
620
621    snps
622}
623
624pub fn get_all_indels(bc: u64, bc_length: usize) -> Vec<u64> {
625    assert!(
626        bc <= 2u64.pow(2 * bc_length as u32),
627        "the barcode id is larger than possible (based on barcode length)"
628    );
629    assert!(
630        bc_length <= 32,
631        "barcode length greater than 32 not supported"
632    );
633
634    let mut indels: Vec<u64> = Vec::with_capacity(8 * (bc_length - 1));
635
636    for nt_index in 1..bc_length {
637        let mut bit_mask = 1 << (2 * nt_index);
638        bit_mask -= 1;
639
640        let upper_half = bc & !bit_mask;
641        let lower_half = bc & bit_mask;
642
643        // iterating over all 4 choices of the nucleotide
644        for i in 0..=3 {
645            let new_insertion_bc = upper_half | get_bit_mask(nt_index, i) | (lower_half >> 2);
646            let new_deletion_bc = upper_half
647                | get_bit_mask(1, i)
648                | ((lower_half & !get_bit_mask(nt_index + 1, 3)) << 2);
649
650            if new_insertion_bc != bc {
651                indels.push(new_insertion_bc);
652            }
653            if new_deletion_bc != bc {
654                indels.push(new_deletion_bc);
655            }
656        }
657    }
658
659    indels
660}
661
662pub fn get_all_one_edit_neighbors(
663    bc: u64,
664    bc_length: usize,
665    neighbors: &mut HashSet<u64>,
666) -> Result<(), Box<dyn Error>> {
667    neighbors.clear();
668
669    let snps: Vec<u64> = get_all_snps(bc, bc_length);
670    let indels: Vec<u64> = get_all_indels(bc, bc_length);
671
672    neighbors.extend(&snps);
673    neighbors.extend(&indels);
674
675    Ok(())
676}
677
678pub fn generate_whitelist_set(
679    whitelist_bcs: &[u64],
680    bc_length: usize,
681) -> Result<HashSet<u64>, Box<dyn Error>> {
682    let num_bcs = whitelist_bcs.len();
683
684    let mut one_edit_barcode_hash: HashSet<u64> = HashSet::new();
685    let mut neighbors: HashSet<u64> = HashSet::new();
686    one_edit_barcode_hash.reserve(10 * num_bcs);
687    // reserved space for 3*length SNP
688    // + 4 * (length -1) insertion
689    // + 4 * (length -1) deletion
690    neighbors.reserve(3 * bc_length + 8 * (bc_length - 1));
691
692    for bc in whitelist_bcs {
693        get_all_one_edit_neighbors(*bc, bc_length, &mut neighbors)?;
694        one_edit_barcode_hash.extend(&neighbors);
695    }
696
697    Ok(one_edit_barcode_hash)
698}
699
700/**
701 * generates a map that contains all one edit distance neighbors
702 * of the permitted barcodes.  The key is the neighbor and the value
703 * is the original permitted barcode to which it maps.
704 **/
705pub fn generate_permitlist_map(
706    permit_bcs: &[u64],
707    bc_length: usize,
708) -> Result<HashMap<u64, u64>, Box<dyn Error>> {
709    let num_bcs = permit_bcs.len();
710
711    let mut one_edit_barcode_map: HashMap<u64, u64> = HashMap::with_capacity(10 * num_bcs);
712    // first insert everything already in the explicit permitlist
713    for bc in permit_bcs {
714        one_edit_barcode_map.insert(*bc, *bc);
715    }
716
717    // reserved space for 3*length SNP
718    // + 4 * (length -1) insertion
719    // + 4 * (length -1) deletion
720    let mut neighbors: HashSet<u64> = HashSet::with_capacity(3 * bc_length + 8 * (bc_length - 1));
721
722    for bc in permit_bcs {
723        get_all_one_edit_neighbors(*bc, bc_length, &mut neighbors)?;
724        for n in &neighbors {
725            one_edit_barcode_map.entry(*n).or_insert(*bc);
726        }
727    }
728
729    Ok(one_edit_barcode_map)
730}
731
732/// Reads the contents of the file `flist`, which should contain
733/// a single barcode per-line, and returns a Result that is either
734/// a HashSet containing the k-mer encoding of all barcodes or
735/// the Error that was encountered parsing the file.
736pub fn read_filter_list(
737    flist: &PathBuf,
738    bclen: u16,
739) -> anyhow::Result<HashSet<u64, ahash::RandomState>> {
740    let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
741    let mut fset = HashSet::<u64, ahash::RandomState>::with_hasher(s);
742
743    let filt_file = std::fs::File::open(flist).context("couldn't open file")?;
744    let mut reader = BufReader::new(filt_file);
745
746    // Read the file line by line using the lines() iterator from std::io::BufRead.
747    reader
748        .for_byte_line(|line| {
749            let mut bnk = BitNuclKmer::new(line, bclen as u8, false);
750            let (_, k, _) = bnk.next().unwrap();
751            fset.insert(k.0);
752            Ok(true)
753        })
754        .unwrap();
755
756    Ok(fset)
757}
758
759pub fn is_velo_mode(input_dir: &PathBuf) -> bool {
760    let parent = std::path::Path::new(input_dir);
761    // open the metadata file and read the json
762    let meta_data_file = File::open(parent.join("generate_permit_list.json"))
763        .expect("could not open the generate_permit_list.json file.");
764    let mdata: serde_json::Value = serde_json::from_reader(meta_data_file)
765        .expect("could not deseralize generate_permit_list.json");
766    let vm = mdata.get("velo_mode");
767    match vm {
768        Some(v) => v.as_bool().unwrap_or(false),
769        None => false,
770    }
771}
772
773#[allow(dead_code)]
774#[derive(Debug, PartialEq, Eq)]
775pub struct InternalVersionInfo {
776    pub major: u32,
777    pub minor: u32,
778    pub patch: u32,
779}
780
781impl InternalVersionInfo {
782    pub fn is_compatible_with(&self, other: &InternalVersionInfo) -> Result<(), String> {
783        if self.major == other.major && self.minor == other.minor {
784            Ok(())
785        } else {
786            let s = format!(
787                "running alevin-fry {} on {} results, please regenerate the results using alevin-fry {} or greater",
788                self, other, self
789            );
790            Err(s)
791        }
792    }
793}
794
795impl fmt::Display for InternalVersionInfo {
796    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
797        write!(f, "v{}.{}.{}", self.major, self.minor, self.patch)
798    }
799}
800
801#[derive(Error, Debug)]
802pub enum VersionParseError {
803    #[error("The version string should be of the format x.y.z; it was `{0}`")]
804    IncorrectFormat(String),
805}
806
807impl FromStr for InternalVersionInfo {
808    type Err = VersionParseError;
809
810    fn from_str(vs: &str) -> Result<Self, Self::Err> {
811        let versions: Vec<u32> = vs.split('.').map(|s| s.parse::<u32>().unwrap()).collect();
812        if versions.len() != 3 {
813            return Err(VersionParseError::IncorrectFormat(vs.to_string()));
814        }
815        Ok(Self {
816            major: versions[0],
817            minor: versions[1],
818            patch: versions[2],
819        })
820    }
821}
822
823#[cfg(test)]
824mod tests {
825    use crate::utils::generate_whitelist_set;
826    use crate::utils::get_all_indels;
827    use crate::utils::get_all_one_edit_neighbors;
828    use crate::utils::get_all_snps;
829    use crate::utils::get_bit_mask;
830    use crate::utils::InternalVersionInfo;
831    use std::collections::HashSet;
832    use std::str::FromStr;
833
834    #[test]
835    fn test_version_info() {
836        let vi = InternalVersionInfo::from_str("1.2.3").unwrap();
837        assert_eq!(
838            vi,
839            InternalVersionInfo {
840                major: 1,
841                minor: 2,
842                patch: 3
843            }
844        );
845    }
846
847    #[test]
848    fn test_get_bit_mask() {
849        let mut output = Vec::new();
850        for i in 0..=3 {
851            let mask = get_bit_mask(2, i);
852            output.push(mask);
853        }
854        assert_eq!(output, vec![0, 4, 8, 12]);
855    }
856
857    #[test]
858    fn test_get_all_snps() {
859        let mut output: Vec<u64> = get_all_snps(7, 3).into_iter().collect();
860        output.sort_unstable();
861
862        assert_eq!(output, vec![3, 4, 5, 6, 11, 15, 23, 39, 55]);
863    }
864
865    #[test]
866    fn test_get_all_indels() {
867        let mut output: Vec<u64> = get_all_indels(7, 3).into_iter().collect();
868        output.sort_unstable();
869        output.dedup();
870
871        assert_eq!(output, vec![1, 4, 5, 6, 9, 12, 13, 14, 15, 28, 29, 30, 31]);
872    }
873
874    #[test]
875    fn test_get_all_one_edit_neighbors() {
876        let mut neighbors: HashSet<u64> = HashSet::new();
877        get_all_one_edit_neighbors(7, 3, &mut neighbors).unwrap();
878
879        let mut output: Vec<u64> = neighbors.into_iter().collect();
880
881        output.sort_unstable();
882        output.dedup();
883
884        assert_eq!(
885            output,
886            vec![1, 3, 4, 5, 6, 9, 11, 12, 13, 14, 15, 23, 28, 29, 30, 31, 39, 55]
887        );
888    }
889
890    #[test]
891    fn test_generate_whitelist_hash() {
892        let neighbors: HashSet<u64> = generate_whitelist_set(&[7], 3).unwrap();
893        let mut output: Vec<u64> = neighbors.into_iter().collect();
894
895        output.sort_unstable();
896        output.dedup();
897
898        assert_eq!(
899            output,
900            vec![1, 3, 4, 5, 6, 9, 11, 12, 13, 14, 15, 23, 28, 29, 30, 31, 39, 55]
901        );
902    }
903}
alevin_fry/utils.rs

alevin_fry/
utils.rs