deepbiop_utils/
blat.rs

1use std::fs::File;
2use std::io::prelude::*;
3use std::io::BufReader;
4use std::io::Write;
5use std::path::Path;
6use std::path::PathBuf;
7use std::process::Command;
8
9use ahash::HashMap;
10use ahash::HashMapExt;
11use anyhow::Result;
12use derive_builder::Builder;
13use pyo3::prelude::*;
14use serde::{Deserialize, Serialize};
15use tempfile::tempdir;
16
17use pyo3_stub_gen::derive::*;
18
19pub const MIN_SEQ_SIZE: usize = 20;
20// psLayout version 3
21
22// match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T    T        block   blockSizes      qStarts  tStarts
23//         match   match           count   bases   count   bases           name            size    start   end     name            size    startend      count
24// ---------------------------------------------------------------------------------------------------------------------------------------------------------------
25// 23      1       0       0       0       0       0       0       +       seq     51      3       27      chr12   133275309       11447342     11447366 1       24,     3,      11447342,
26
27#[gen_stub_pyclass]
28#[pyclass(module = "deepbiop.utils")]
29#[derive(Debug, Default, Builder, Clone, Serialize, Deserialize)]
30pub struct PslAlignment {
31    #[pyo3(get, set)]
32    pub qname: String,
33    #[pyo3(get, set)]
34    pub qsize: usize,
35    #[pyo3(get, set)]
36    pub qstart: usize,
37    #[pyo3(get, set)]
38    pub qend: usize,
39    #[pyo3(get, set)]
40    pub qmatch: usize,
41    #[pyo3(get, set)]
42    pub tname: String,
43    #[pyo3(get, set)]
44    pub tsize: usize,
45    #[pyo3(get, set)]
46    pub tstart: usize,
47    #[pyo3(get, set)]
48    pub tend: usize,
49    #[pyo3(get, set)]
50    pub identity: f32,
51}
52
53pub fn parse_psl_by_qname<P: AsRef<Path>>(file: P) -> Result<HashMap<String, Vec<PslAlignment>>> {
54    let result = parse_psl(file)?;
55    Ok(result.into_iter().fold(HashMap::new(), |mut acc, al| {
56        let qname = al.qname.clone();
57        acc.entry(qname).or_default().push(al);
58        acc
59    }))
60}
61
62pub fn parse_psl<P: AsRef<Path>>(file: P) -> Result<Vec<PslAlignment>> {
63    let file = File::open(file)?;
64    let mut reader = BufReader::new(file);
65    let mut line = String::new();
66
67    // Skip the first 5 lines
68    for _ in 0..5 {
69        reader.read_line(&mut line)?;
70        line.clear();
71    }
72    let mut alignments = Vec::new();
73
74    // only get match, Qsize, qstart, qend, Tsize, tstart, tend
75    while reader.read_line(&mut line)? > 0 {
76        let fields: Vec<&str> = line.split_whitespace().collect();
77
78        let match_: usize = lexical::parse(fields[0])?;
79        let qname = fields[9];
80        let qsize: usize = lexical::parse(fields[10])?;
81        let qstart: usize = lexical::parse(fields[11])?;
82        let qend: usize = lexical::parse(fields[12])?;
83
84        let tname = fields[13];
85        let tsize: usize = lexical::parse(fields[14])?;
86        let tstart: usize = lexical::parse(fields[15])?;
87        let tend: usize = lexical::parse(fields[16])?;
88
89        let identity = match_ as f32 / qsize as f32;
90
91        let al = PslAlignmentBuilder::default()
92            .qname(qname.to_string())
93            .qsize(qsize)
94            .qstart(qstart)
95            .qend(qend)
96            .qmatch(match_)
97            .tname(tname.to_string())
98            .tsize(tsize)
99            .tstart(tstart)
100            .tend(tend)
101            .identity(identity)
102            .build()?;
103        alignments.push(al);
104        line.clear();
105    }
106    Ok(alignments)
107}
108
109pub fn blat_for_seq<P: AsRef<Path> + AsRef<std::ffi::OsStr>>(
110    path: P,
111    blat_cli: P,
112    two_bit: P,
113    output: P,
114) -> Result<Vec<PslAlignment>> {
115    let _output = Command::new(blat_cli)
116        .arg("-stepSize=5")
117        .arg("-repMatch=2253")
118        .arg("-minScore=20")
119        .arg("-minIdentity=0")
120        .arg(two_bit)
121        .arg(path)
122        .arg(&output)
123        .output()?;
124
125    if !_output.status.success() {
126        return Err(anyhow::anyhow!(
127            "blat failed: {}",
128            String::from_utf8_lossy(&_output.stderr)
129        ));
130    }
131    parse_psl(output)
132}
133
134// ./blat -stepSize=5 -repMatch=2253 -minScore=20 -minIdentity=0  hg38.2bit t.fa  output.psl
135pub fn blat<P: AsRef<Path> + AsRef<std::ffi::OsStr>>(
136    seq: &str,
137    blat_cli: P,
138    two_bit: P,
139    output: Option<&str>,
140) -> Result<Vec<PslAlignment>> {
141    log::debug!("blat_cli: {}", seq);
142
143    // Create a file inside `std::env::temp_dir()`.
144    let dir = tempdir()?;
145    let file1 = dir.path().join("seq.fa");
146    let mut tmp_file1 = File::create(file1.clone())?;
147    writeln!(tmp_file1, ">seq\n")?;
148    writeln!(tmp_file1, "{}", seq)?;
149
150    // Create a directory inside `std::env::temp_dir()`.
151    let output_file = if let Some(value) = output {
152        PathBuf::from(value)
153    } else {
154        dir.path().join("output.psl")
155    };
156
157    let _output = Command::new(blat_cli)
158        .arg("-stepSize=5")
159        .arg("-repMatch=2253")
160        .arg("-minScore=20")
161        .arg("-minIdentity=0")
162        .arg(two_bit)
163        .arg(file1)
164        .arg(output_file.clone())
165        .output()?;
166
167    if !_output.status.success() || !output_file.exists() {
168        return Err(anyhow::anyhow!(
169            "blat failed: {}",
170            String::from_utf8_lossy(&_output.stderr)
171        ));
172    }
173
174    parse_psl(output_file)
175}
176
177#[cfg(test)]
178mod tests {
179    // use super::*;
180
181    #[test]
182    fn test_parse_psl() {
183        let _file = "tests/data/sample.psl";
184    }
185}