hmmer_rs/
hmmsearch.rs

1use log::*;
2use std::ffi::{CStr, CString};
3
4use crate::{hmm::*, libhmmer_sys_extras, EaselSequence};
5
6pub struct HmmerPipeline {
7    info: HmmsearchWorkerInfo,
8}
9
10impl HmmerPipeline {
11    pub fn new(hmm: &Hmm) -> HmmerPipeline {
12        // P7_PROFILE      *gm      = NULL;
13        // P7_OPROFILE     *om      = NULL;       /* optimized query profile                  */
14        #[allow(unused_assignments)]
15        let mut gm: *mut libhmmer_sys::P7_PROFILE = std::ptr::null_mut();
16        #[allow(unused_assignments)]
17        let mut om: *mut libhmmer_sys::P7_OPROFILE = std::ptr::null_mut();
18        // int              textw    = 0;
19
20        // WORKER_INFO     *info     = NULL;
21        let bg = unsafe { libhmmer_sys::p7_bg_Create(hmm.c_alphabet()) };
22        debug!("Background model created successfully");
23
24        //   /* Convert to an optimized model */
25        // gm = p7_profile_Create (hmm->M, abc);
26        // om = p7_oprofile_Create(hmm->M, abc);
27        // p7_ProfileConfig(hmm, info->bg, gm, 100, p7_LOCAL); /* 100 is a dummy length for now; and MSVFilter requires local mode */
28        // p7_oprofile_Convert(gm, om);                  /* <om> is now p7_LOCAL, multihit */
29        let abc = hmm.c_alphabet();
30        gm = unsafe { libhmmer_sys::p7_profile_Create(hmm.length() as i32, abc) };
31        debug!("Profile created successfully");
32        om = unsafe { libhmmer_sys::p7_oprofile_Create(hmm.length() as i32, abc) };
33        debug!("Optimized profile created successfully");
34
35        unsafe {
36            libhmmer_sys::p7_ProfileConfig(hmm.c_hmm, bg, gm, 100, libhmmer_sys_extras::p7_LOCAL);
37            debug!("Profile configured successfully");
38            libhmmer_sys::p7_oprofile_Convert(gm, om);
39            debug!("Optimized profile converted successfully");
40        }
41
42        // /* Create processing pipeline and hit list */
43        // info[i].th  = p7_tophits_Create();
44        // info[i].om  = p7_oprofile_Clone(om);
45        // info[i].pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */
46        // status = p7_pli_NewModel(info[i].pli, info[i].om, info[i].bg);
47        // if (status == eslEINVAL) p7_Fail(info->pli->errbuf);
48        let th = unsafe { libhmmer_sys::p7_tophits_Create() };
49        debug!("Tophits created successfully");
50        // let om = libhmmer_sys::p7_oprofile_Clone(om);
51        let pli = unsafe {
52            libhmmer_sys::p7_pipeline_Create(
53                std::ptr::null_mut(),
54                (*om).M,
55                100,
56                0,
57                libhmmer_sys_extras::p7_SEARCH_SEQS as u32,
58            )
59        };
60        debug!("Pipeline created successfully");
61        unsafe {
62            let status = libhmmer_sys::p7_pli_NewModel(pli, om, bg);
63            if status == libhmmer_sys::eslEINVAL as i32 {
64                panic!(); // TODO: Better msg
65            }
66        }
67        debug!("Pipeline new model created successfully");
68
69        let info = HmmsearchWorkerInfo { th, om, pli, bg };
70
71        HmmerPipeline { info }
72    }
73
74    pub fn run_hmm_on_file(&mut self, hmm: &Hmm, fasta_path: &std::path::Path) -> HmmsearchResult {
75        debug!("Starting run_hmm_on_file");
76        #[allow(unused_mut)]
77        let mut dbfile = Self::open_target_sequences(&fasta_path.to_string_lossy());
78        debug!("Target sequences opened successfully");
79
80        // TODO: output_header(ofp, go, cfg->hmmfile, cfg->dbfile);
81
82        //       esl_sqfile_SetDigital(dbfp, abc); //ReadBlock requires knowledge of the alphabet to decide how best to read blocks
83        unsafe {
84            libhmmer_sys::esl_sqfile_SetDigital(dbfile, hmm.c_alphabet());
85        }
86        debug!("Target sequences set to digital successfully");
87
88        // if (fprintf(ofp, "Query:       %s  [M=%d]\n", hmm->name, hmm->M)  < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
89        // if (hmm->acc)  { if (fprintf(ofp, "Accession:   %s\n", hmm->acc)  < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }
90        // if (hmm->desc) { if (fprintf(ofp, "Description: %s\n", hmm->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }
91        println!("Query:       {}  [M={}]", hmm.name(), hmm.length());
92        println!("Accession:   {}", hmm.acc());
93        println!("Description: {}", hmm.desc());
94
95        // sstatus = serial_loop(info, dbfp, cfg->n_targetseq);
96        // TODO: n_targetseq == -1 means no limit. OK for now.
97        let sstatus = self.serial_loop_over_esl_sqio(dbfile, -1);
98
99        // switch(sstatus)
100        // {
101        // case eslEFORMAT:
102        //   esl_fatal("Parse failed (sequence file %s):\n%s\n",
103        //       dbfp->filename, esl_sqfile_GetErrorBuf(dbfp));
104        //   break;
105        // case eslEOF:
106        //   /* do nothing */
107        //   break;
108        // default:
109        //   esl_fatal("Unexpected error %d reading sequence file %s", sstatus, dbfp->filename);
110        // }
111
112        // Unclear to me why restating these consts is necessary to avoid
113        // compile-time warnings.
114        const ESL_FORMAT: i32 = libhmmer_sys::eslEFORMAT as i32;
115        const ESL_EOF: i32 = libhmmer_sys::eslEOF as i32;
116        match sstatus {
117            ESL_FORMAT => {
118                // panic!("Parse failed (sequence file {}):\n{}", fasta_path.to_string_lossy(), unsafe { libhmmer_sys::esl_sqfile_GetErrorBuf(dbfile) });
119                // TODO: Make the above compile
120                panic!(
121                    "Parse failed (sequence file {})",
122                    fasta_path.to_string_lossy()
123                );
124            }
125            ESL_EOF => {
126                // do nothing
127            }
128            _ => {
129                panic!(
130                    "Unexpected error {} reading sequence file {}",
131                    sstatus,
132                    fasta_path.to_string_lossy()
133                );
134            }
135        }
136
137        // /* merge the results of the search results */
138        // for (i = 1; i < infocnt; ++i)
139        // {
140        // p7_tophits_Merge(info[0].th, info[i].th);
141        // p7_pipeline_Merge(info[0].pli, info[i].pli);
142
143        // p7_pipeline_Destroy(info[i].pli);
144        // p7_tophits_Destroy(info[i].th);
145        // p7_oprofile_Destroy(info[i].om);
146        // }
147
148        // ===> This block of code is not necessary because we only have one thread.
149
150        // /* Print the results.  */
151        // p7_tophits_SortBySortkey(info->th);
152        // p7_tophits_Threshold(info->th, info->pli);
153        // p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
154        // p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
155        unsafe {
156            libhmmer_sys::p7_tophits_SortBySortkey(self.info.th);
157            libhmmer_sys::p7_tophits_Threshold(self.info.th, self.info.pli);
158
159            // TODO: Implement (optional) output of default output.
160            // panic!("Need to get FILE* from fopen or stdout to do this");
161            // libhmmer_sys::p7_tophits_Targets(FIXME, (*info).th, (*info).pli, textw);
162            // libhmmer_sys::p7_tophits_Domains(FIXME, (*info).th, (*info).pli, textw);
163        }
164
165        // if (tblfp)     p7_tophits_TabularTargets(tblfp,    hmm->name, hmm->acc, info->th, info->pli, (nquery == 1));
166        // if (domtblfp)  p7_tophits_TabularDomains(domtblfp, hmm->name, hmm->acc, info->th, info->pli, (nquery == 1));
167        // if (pfamtblfp) p7_tophits_TabularXfam(pfamtblfp, hmm->name, hmm->acc, info->th, info->pli);
168        // TODO: The above. I don't need them for now.
169
170        // TODO: Destroy, free, etc.
171
172        HmmsearchResult {
173            c_th: self.info.th,
174            c_pli: self.info.pli,
175        }
176    }
177
178    // TODO: Add a coresponding free() method
179    fn open_target_sequences(fasta_file: &str) -> *mut libhmmer_sys::esl_sqio_s {
180        //   int              dbfmt    = eslSQFILE_UNKNOWN; /* format code for sequence database file          */
181        let dbfmt = 0; //libhmmer_sys::eslSQFILE_UNKNOWN;
182
183        //   ESL_SQFILE      *dbfp     = NULL;              /* open input sequence file                        */
184        let mut dbfp: *mut libhmmer_sys::ESL_SQFILE = std::ptr::null_mut();
185
186        //   /* Open the target sequence database */
187        //   status = esl_sqfile_Open(cfg->dbfile, dbfmt, p7_SEQDBENV, &dbfp);
188        //   if      (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n",          cfg->dbfile);
189        //   else if (status == eslEFORMAT)   p7_Fail("Sequence file %s is empty or misformatted\n",            cfg->dbfile);
190        //   else if (status == eslEINVAL)    p7_Fail("Can't autodetect format of a stdin or .gz seqfile");
191        //   else if (status != eslOK)        p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->dbfile);
192        let file_pointer = CString::new(fasta_file.as_bytes()).unwrap().into_raw();
193        let status = unsafe {
194            // Open the file not assuming anything about its format, and let
195            // autodetect do its thing. Possibly we should use eslSQFILE_FASTA.
196            libhmmer_sys::esl_sqfile_Open(file_pointer, dbfmt, std::ptr::null(), &mut dbfp)
197        };
198        println!("Opened fasta file with status {status}");
199
200        if status == libhmmer_sys::eslENOTFOUND as i32 {
201            panic!("Failed to open sequence file {fasta_file} for reading");
202        } else if status == libhmmer_sys::eslEFORMAT as i32 {
203            panic!("Sequence file {fasta_file} is empty or misformatted");
204        } else if status == libhmmer_sys::eslEINVAL as i32 {
205            panic!("Can't autodetect format of a stdin or .gz seqfile");
206        } else if status != libhmmer_sys::eslOK as i32 {
207            panic!("Unexpected error {status} opening sequence file {fasta_file}");
208        }
209        dbfp
210    }
211
212    /// This method (called serial_loop in C) is not available in libhmmer_sys,
213    /// so we have to implement it here. Intended as a direct replacement for
214    /// the C function.
215    fn serial_loop_over_esl_sqio(
216        &mut self,
217        dbfp: *mut libhmmer_sys::esl_sqio_s,
218        n_targetseqs: i32,
219    ) -> i32 {
220        debug!("serial_loop");
221
222        //   int              status;                       /* easel return code                               */
223        let mut sstatus: i32;
224
225        //   ESL_SQ   *dbsq     = NULL;   /* one target sequence (digital)  */
226        #[allow(unused_mut, unused_assignments)]
227        let mut dbsq: *mut libhmmer_sys::ESL_SQ = std::ptr::null_mut();
228
229        // int seq_cnt = 0;
230        let mut seq_cnt: i32 = 0;
231
232        // For convenience
233        let info = &mut self.info;
234
235        // dbsq = esl_sq_CreateDigital(info->om->abc);
236        dbsq = unsafe {
237            let abc_here = (*info.om).abc;
238            debug!("Creating digital sequence from abc {:?}", abc_here);
239            debug!("K in abc is {}", (*abc_here).K);
240            libhmmer_sys::esl_sq_CreateDigital(abc_here)
241        };
242        debug!("dbsq created as digital, from abc {:?}", unsafe {
243            (*dbsq).abc
244        });
245
246        //   /* Main loop: */
247        //   while ( (n_targetseqs==-1 || seq_cnt<n_targetseqs) &&  (sstatus = esl_sqio_Read(dbfp, dbsq)) == eslOK)
248        //   {
249        sstatus = unsafe { libhmmer_sys::esl_sqio_Read(dbfp, dbsq) };
250        debug!("esl_sqio_Read returned {}", sstatus);
251        debug!("dbsq is {:?}", dbsq);
252        debug!("dbsq internals: {:#?}", EaselSequence { c_sq: dbsq });
253
254        while (n_targetseqs == -1 || seq_cnt < n_targetseqs)
255            && sstatus == libhmmer_sys::eslOK as i32
256        {
257            unsafe {
258                // p7_pli_NewSeq(info->pli, dbsq);
259                libhmmer_sys::p7_pli_NewSeq(info.pli, dbsq);
260
261                // p7_bg_SetLength(info->bg, dbsq->n);
262                libhmmer_sys::p7_bg_SetLength(
263                    info.bg,
264                    (*dbsq).n.try_into().expect("i64 -> i32 failed"),
265                );
266                // p7_oprofile_ReconfigLength(info->om, dbsq->n);
267                libhmmer_sys::p7_oprofile_ReconfigLength(
268                    info.om,
269                    (*dbsq).n.try_into().expect("i64 -> i32 failed"),
270                );
271
272                // p7_Pipeline(info->pli, info->om, info->bg, dbsq, NULL, info->th);
273                let p7_sstatus = libhmmer_sys::p7_Pipeline(
274                    info.pli,
275                    info.om,
276                    info.bg,
277                    dbsq,
278                    std::ptr::null_mut(),
279                    info.th,
280                );
281                if p7_sstatus != libhmmer_sys::eslOK as i32 {
282                    panic!("p7_Pipeline sstatus indicated failure, was {p7_sstatus}");
283                }
284
285                // In the C code, this is part of the while loop condition.
286                sstatus = libhmmer_sys::esl_sqio_Read(dbfp, dbsq);
287                debug!("esl_sqio_Read returned {sstatus}");
288            }
289
290            // seq_cnt++;
291            seq_cnt += 1;
292            // esl_sq_Reuse(dbsq);
293            // p7_pipeline_Reuse(info->pli);
294            unsafe {
295                libhmmer_sys::esl_sq_Reuse(dbsq);
296                libhmmer_sys::p7_pipeline_Reuse(info.pli);
297            }
298        }
299
300        // if (n_targetseqs!=-1 && seq_cnt==n_targetseqs)
301        // sstatus = eslEOF;
302        if n_targetseqs != -1 && seq_cnt == n_targetseqs {
303            sstatus = libhmmer_sys::eslEOF as i32;
304        }
305
306        // esl_sq_Destroy(dbsq);
307        unsafe {
308            libhmmer_sys::esl_sq_Destroy(dbsq);
309        }
310
311        sstatus
312    }
313
314    pub fn query(&mut self, easel_sequence: &EaselSequence) {
315        let info = &mut self.info;
316
317        unsafe {
318            // p7_pli_NewSeq(info->pli, dbsq);
319            if libhmmer_sys::p7_pli_NewSeq(info.pli, easel_sequence.c_sq)
320                != libhmmer_sys::eslOK as i32
321            {
322                panic!()
323            };
324
325            // p7_bg_SetLength(info->bg, dbsq->n);
326            if libhmmer_sys::p7_bg_SetLength(
327                info.bg,
328                (*easel_sequence.c_sq)
329                    .n
330                    .try_into()
331                    .expect("i64 -> i32 failed"),
332            ) != libhmmer_sys::eslOK as i32
333            {
334                panic!()
335            };
336            // p7_oprofile_ReconfigLength(info->om, dbsq->n);
337            if libhmmer_sys::p7_oprofile_ReconfigLength(
338                info.om,
339                (*easel_sequence.c_sq)
340                    .n
341                    .try_into()
342                    .expect("i64 -> i32 failed"),
343            ) != libhmmer_sys::eslOK as i32
344            {
345                panic!()
346            };
347
348            // p7_Pipeline(info->pli, info->om, info->bg, dbsq, NULL, info->th);
349            let sstatus = libhmmer_sys::p7_Pipeline(
350                info.pli,
351                info.om,
352                info.bg,
353                easel_sequence.c_sq,
354                std::ptr::null_mut(),
355                info.th,
356            );
357            debug!("query p7_Pipeline sstatus {}", sstatus);
358            if sstatus != libhmmer_sys::eslOK as i32 {
359                panic!("p7_Pipeline sstatus indicated failure, was {sstatus}");
360            }
361        }
362    }
363
364    pub fn get_results(&mut self) -> HmmsearchResult {
365        unsafe {
366            debug!("Running p7_tophits_SortBySortkey");
367            libhmmer_sys::p7_tophits_SortBySortkey(self.info.th);
368            debug!("Running p7_tophits_Threshold");
369            libhmmer_sys::p7_tophits_Threshold(self.info.th, self.info.pli);
370        }
371        HmmsearchResult {
372            c_th: self.info.th,
373            c_pli: self.info.pli,
374        }
375    }
376}
377
378// typedef struct {
379//       P7_BG            *bg;	         /* null model                              */
380//       P7_PIPELINE      *pli;         /* work pipeline                           */
381//       P7_TOPHITS       *th;          /* top hit results                         */
382//       P7_OPROFILE      *om;          /* optimized query profile                 */
383//     } WORKER_INFO;
384pub struct HmmsearchWorkerInfo {
385    bg: *mut libhmmer_sys::P7_BG,
386    pli: *mut libhmmer_sys::P7_PIPELINE,
387    th: *mut libhmmer_sys::P7_TOPHITS,
388    om: *mut libhmmer_sys::P7_OPROFILE,
389}
390
391#[derive(Debug)]
392pub struct HmmsearchResult {
393    pub c_th: *mut libhmmer_sys::p7_tophits_s,
394    pub c_pli: *mut libhmmer_sys::p7_pipeline_s,
395}
396
397impl Drop for HmmsearchResult {
398    fn drop(&mut self) {
399        unsafe {
400            libhmmer_sys::p7_tophits_Destroy(self.c_th);
401            libhmmer_sys::p7_pipeline_Destroy(self.c_pli);
402        }
403    }
404}
405
406impl HmmsearchResult {
407    /// Number of reported hits
408    pub fn nreported(&self) -> usize {
409        unsafe {
410            (*self.c_th)
411                .nreported
412                .try_into()
413                .expect("i64 -> usize failed")
414        }
415    }
416
417    pub fn hits(&self) -> HmmsearchResultTopHits {
418        HmmsearchResultTopHits {
419            c_th: self.c_th,
420            c_pli: self.c_pli,
421            current: 0,
422            nreported: self.nreported(),
423        }
424    }
425}
426
427pub struct HmmsearchResultTopHits {
428    c_th: *mut libhmmer_sys::p7_tophits_s,
429    c_pli: *mut libhmmer_sys::p7_pipeline_s,
430    current: usize,
431    nreported: usize,
432}
433
434impl Iterator for HmmsearchResultTopHits {
435    type Item = HmmsearchResultTopHit;
436
437    fn next(&mut self) -> Option<Self::Item> {
438        if self.current >= self.nreported {
439            return None;
440        }
441
442        let hit = HmmsearchResultTopHit {
443            c_hit: unsafe { *(*self.c_th).hit.add(self.current) },
444            c_pli: self.c_pli,
445            current_domain: 0,
446        };
447
448        self.current += 1;
449
450        Some(hit)
451    }
452}
453
454pub struct HmmsearchResultTopHit {
455    c_hit: *mut libhmmer_sys::p7_hit_s,
456    c_pli: *mut libhmmer_sys::p7_pipeline_s,
457    current_domain: usize,
458}
459
460impl HmmsearchResultTopHit {
461    // println!("Name of first hit {}", unsafe {
462    //     CStr::from_ptr(first_hit.name).to_string_lossy()
463    // });
464    pub fn name(&self) -> String {
465        unsafe {
466            CStr::from_ptr((*self.c_hit).name)
467                .to_string_lossy()
468                .into_owned()
469        }
470    }
471
472    // println!("Score of first hit overall {}", first_hit.score);
473    pub fn score(&self) -> f32 {
474        unsafe { (*self.c_hit).score }
475    }
476}
477
478impl Iterator for HmmsearchResultTopHit {
479    type Item = HmmsearchResultDomain;
480
481    fn next(&mut self) -> Option<Self::Item> {
482        // TODO: Check for end of hits
483        if self.current_domain >= unsafe { (*self.c_hit).ndom.try_into().unwrap() } {
484            return None;
485        }
486        println!("current domain counter {} ", self.current_domain);
487        let domain = HmmsearchResultDomain {
488            c_dom: unsafe { (*self.c_hit).dcl.add(self.current_domain) },
489            c_pli: self.c_pli,
490        };
491
492        self.current_domain += 1;
493
494        Some(domain)
495    }
496}
497
498pub struct HmmsearchResultDomain {
499    c_dom: *mut libhmmer_sys::p7_dom_s,
500    c_pli: *mut libhmmer_sys::p7_pipeline_s,
501}
502
503impl HmmsearchResultDomain {
504    // println!("First domain score: {}", first_domain.bitscore);
505    pub fn bitscore(&self) -> f32 {
506        unsafe { (*self.c_dom).bitscore }
507    }
508
509    // // exp(th->hit[h]->lnP) * pli->Z;
510    // let evalue = first_domain.lnP.exp() * unsafe { (*hmmsearch_result.c_pli).Z };
511    // println!("First domain evalue: {:?}", evalue);
512    pub fn evalue(&self) -> f64 {
513        unsafe { (*self.c_dom).lnP.exp() * (*self.c_pli).Z }
514    }
515}