// Copyright (c) 2018 10X Genomics, Inc. All rights reserved.

// Extract zero-based human or mouse exon positions from Ensembl gtf file:
// { ( chr-name, start, stop, fw?, gene-name, exon ) }.

use io_utils::open_for_read;
use std::{assert, format, i32, io::BufRead, str};
use string_utils::TextUtils;
use vector_utils::unique_sort;

/// Load exon records for `species` from a hard-coded Ensembl release-94 gtf file.
///
/// `exons` is cleared and then filled with one tuple per accepted gtf line:
/// `(chr-name, start - 1, stop, fw, gene-name, exon-number)`, where `fw` is true
/// exactly when the strand column is `"+"`.  The start column is decremented by
/// one and the stop column is stored unchanged (presumably converting the gtf's
/// one-based closed interval into a zero-based half-open one).  After all lines
/// are read the vector is handed to `unique_sort` (presumably sorting and
/// removing duplicates -- see vector_utils).
///
/// # Panics
///
/// Panics if `species` is neither `"human"` nor `"mouse"`, or if a line cannot
/// be read from the file.  The helpers `open_for_read!` and `force_i32` are
/// defined elsewhere; presumably they also panic on an unopenable file or a
/// non-numeric field -- confirm against their definitions.
pub fn fetch_exons(species: &str, exons: &mut Vec<(String, i32, i32, bool, String, i32)>) {
    assert!(species == "human" || species == "mouse");

    // Pick the gtf file for the species.  See notes in bin/build_vdj_ref.fs.

    let root = "/mnt/opt/meowmix_git/ensembl/release-94/gtf";
    let gtf: String = match species {
        "human" => format!(
            "{}/homo_sapiens/Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf",
            root
        ),
        // The assertion above leaves "mouse" as the only other possibility.
        _ => format!("{}/mus_musculus/Mus_musculus.GRCm38.94.gtf", root),
    };

    // Walk the gtf file line by line, keeping only the records of interest.

    exons.clear();

    let reader = open_for_read![&gtf];
    for line in reader.lines() {
        let record = line.unwrap();

        // A usable record has at least nine tab-separated columns; the ninth
        // holds the ';'-separated attribute list.
        let columns: Vec<&str> = record.split_terminator('\t').collect();
        if columns.len() < 9 {
            continue;
        }
        let attrs: Vec<&str> = columns[8].split_terminator(';').collect();

        // The exon number and gene name are looked up by fixed position
        // (fifth and sixth attribute); lines laid out differently are skipped.
        // NOTE(review): the feature-type column (columns[2]) is never inspected,
        // so any line carrying these two attributes at these positions is
        // accepted -- confirm that is intended.
        if attrs.len() < 6
            || !attrs[4].contains("exon_number")
            || !attrs[5].contains("gene_name")
        {
            continue;
        }

        // Attribute values are the text between the double quotes.
        let exon_number = attrs[4].between("\"", "\"").force_i32();
        let gene_name = attrs[5].between("\"", "\"");

        let chr_name = columns[0];
        let start = columns[3].force_i32();
        let stop = columns[4].force_i32();
        let is_forward = columns[6] == "+";

        exons.push((
            chr_name.to_string(),
            start - 1,
            stop,
            is_forward,
            gene_name.to_string(),
            exon_number,
        ));
    }
    unique_sort(exons);
}