hmmer_rs/hmmsearch.rs
1use log::*;
2use std::ffi::{CStr, CString};
3
4use crate::{hmm::*, libhmmer_sys_extras, EaselSequence};
5
6pub struct HmmerPipeline {
7 info: HmmsearchWorkerInfo,
8}
9
10impl HmmerPipeline {
11 pub fn new(hmm: &Hmm) -> HmmerPipeline {
12 // P7_PROFILE *gm = NULL;
13 // P7_OPROFILE *om = NULL; /* optimized query profile */
14 #[allow(unused_assignments)]
15 let mut gm: *mut libhmmer_sys::P7_PROFILE = std::ptr::null_mut();
16 #[allow(unused_assignments)]
17 let mut om: *mut libhmmer_sys::P7_OPROFILE = std::ptr::null_mut();
18 // int textw = 0;
19
20 // WORKER_INFO *info = NULL;
21 let bg = unsafe { libhmmer_sys::p7_bg_Create(hmm.c_alphabet()) };
22 debug!("Background model created successfully");
23
24 // /* Convert to an optimized model */
25 // gm = p7_profile_Create (hmm->M, abc);
26 // om = p7_oprofile_Create(hmm->M, abc);
27 // p7_ProfileConfig(hmm, info->bg, gm, 100, p7_LOCAL); /* 100 is a dummy length for now; and MSVFilter requires local mode */
28 // p7_oprofile_Convert(gm, om); /* <om> is now p7_LOCAL, multihit */
29 let abc = hmm.c_alphabet();
30 gm = unsafe { libhmmer_sys::p7_profile_Create(hmm.length() as i32, abc) };
31 debug!("Profile created successfully");
32 om = unsafe { libhmmer_sys::p7_oprofile_Create(hmm.length() as i32, abc) };
33 debug!("Optimized profile created successfully");
34
35 unsafe {
36 libhmmer_sys::p7_ProfileConfig(hmm.c_hmm, bg, gm, 100, libhmmer_sys_extras::p7_LOCAL);
37 debug!("Profile configured successfully");
38 libhmmer_sys::p7_oprofile_Convert(gm, om);
39 debug!("Optimized profile converted successfully");
40 }
41
42 // /* Create processing pipeline and hit list */
43 // info[i].th = p7_tophits_Create();
44 // info[i].om = p7_oprofile_Clone(om);
45 // info[i].pli = p7_pipeline_Create(go, om->M, 100, FALSE, p7_SEARCH_SEQS); /* L_hint = 100 is just a dummy for now */
46 // status = p7_pli_NewModel(info[i].pli, info[i].om, info[i].bg);
47 // if (status == eslEINVAL) p7_Fail(info->pli->errbuf);
48 let th = unsafe { libhmmer_sys::p7_tophits_Create() };
49 debug!("Tophits created successfully");
50 // let om = libhmmer_sys::p7_oprofile_Clone(om);
51 let pli = unsafe {
52 libhmmer_sys::p7_pipeline_Create(
53 std::ptr::null_mut(),
54 (*om).M,
55 100,
56 0,
57 libhmmer_sys_extras::p7_SEARCH_SEQS as u32,
58 )
59 };
60 debug!("Pipeline created successfully");
61 unsafe {
62 let status = libhmmer_sys::p7_pli_NewModel(pli, om, bg);
63 if status == libhmmer_sys::eslEINVAL as i32 {
64 panic!(); // TODO: Better msg
65 }
66 }
67 debug!("Pipeline new model created successfully");
68
69 let info = HmmsearchWorkerInfo { th, om, pli, bg };
70
71 HmmerPipeline { info }
72 }
73
74 pub fn run_hmm_on_file(&mut self, hmm: &Hmm, fasta_path: &std::path::Path) -> HmmsearchResult {
75 debug!("Starting run_hmm_on_file");
76 #[allow(unused_mut)]
77 let mut dbfile = Self::open_target_sequences(&fasta_path.to_string_lossy());
78 debug!("Target sequences opened successfully");
79
80 // TODO: output_header(ofp, go, cfg->hmmfile, cfg->dbfile);
81
82 // esl_sqfile_SetDigital(dbfp, abc); //ReadBlock requires knowledge of the alphabet to decide how best to read blocks
83 unsafe {
84 libhmmer_sys::esl_sqfile_SetDigital(dbfile, hmm.c_alphabet());
85 }
86 debug!("Target sequences set to digital successfully");
87
88 // if (fprintf(ofp, "Query: %s [M=%d]\n", hmm->name, hmm->M) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
89 // if (hmm->acc) { if (fprintf(ofp, "Accession: %s\n", hmm->acc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }
90 // if (hmm->desc) { if (fprintf(ofp, "Description: %s\n", hmm->desc) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); }
91 println!("Query: {} [M={}]", hmm.name(), hmm.length());
92 println!("Accession: {}", hmm.acc());
93 println!("Description: {}", hmm.desc());
94
95 // sstatus = serial_loop(info, dbfp, cfg->n_targetseq);
96 // TODO: n_targetseq == -1 means no limit. OK for now.
97 let sstatus = self.serial_loop_over_esl_sqio(dbfile, -1);
98
99 // switch(sstatus)
100 // {
101 // case eslEFORMAT:
102 // esl_fatal("Parse failed (sequence file %s):\n%s\n",
103 // dbfp->filename, esl_sqfile_GetErrorBuf(dbfp));
104 // break;
105 // case eslEOF:
106 // /* do nothing */
107 // break;
108 // default:
109 // esl_fatal("Unexpected error %d reading sequence file %s", sstatus, dbfp->filename);
110 // }
111
112 // Unclear to me why restating these consts is necessary to avoid
113 // compile-time warnings.
114 const ESL_FORMAT: i32 = libhmmer_sys::eslEFORMAT as i32;
115 const ESL_EOF: i32 = libhmmer_sys::eslEOF as i32;
116 match sstatus {
117 ESL_FORMAT => {
118 // panic!("Parse failed (sequence file {}):\n{}", fasta_path.to_string_lossy(), unsafe { libhmmer_sys::esl_sqfile_GetErrorBuf(dbfile) });
119 // TODO: Make the above compile
120 panic!(
121 "Parse failed (sequence file {})",
122 fasta_path.to_string_lossy()
123 );
124 }
125 ESL_EOF => {
126 // do nothing
127 }
128 _ => {
129 panic!(
130 "Unexpected error {} reading sequence file {}",
131 sstatus,
132 fasta_path.to_string_lossy()
133 );
134 }
135 }
136
137 // /* merge the results of the search results */
138 // for (i = 1; i < infocnt; ++i)
139 // {
140 // p7_tophits_Merge(info[0].th, info[i].th);
141 // p7_pipeline_Merge(info[0].pli, info[i].pli);
142
143 // p7_pipeline_Destroy(info[i].pli);
144 // p7_tophits_Destroy(info[i].th);
145 // p7_oprofile_Destroy(info[i].om);
146 // }
147
148 // ===> This block of code is not necessary because we only have one thread.
149
150 // /* Print the results. */
151 // p7_tophits_SortBySortkey(info->th);
152 // p7_tophits_Threshold(info->th, info->pli);
153 // p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
154 // p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
155 unsafe {
156 libhmmer_sys::p7_tophits_SortBySortkey(self.info.th);
157 libhmmer_sys::p7_tophits_Threshold(self.info.th, self.info.pli);
158
159 // TODO: Implement (optional) output of default output.
160 // panic!("Need to get FILE* from fopen or stdout to do this");
161 // libhmmer_sys::p7_tophits_Targets(FIXME, (*info).th, (*info).pli, textw);
162 // libhmmer_sys::p7_tophits_Domains(FIXME, (*info).th, (*info).pli, textw);
163 }
164
165 // if (tblfp) p7_tophits_TabularTargets(tblfp, hmm->name, hmm->acc, info->th, info->pli, (nquery == 1));
166 // if (domtblfp) p7_tophits_TabularDomains(domtblfp, hmm->name, hmm->acc, info->th, info->pli, (nquery == 1));
167 // if (pfamtblfp) p7_tophits_TabularXfam(pfamtblfp, hmm->name, hmm->acc, info->th, info->pli);
168 // TODO: The above. I don't need them for now.
169
170 // TODO: Destroy, free, etc.
171
172 HmmsearchResult {
173 c_th: self.info.th,
174 c_pli: self.info.pli,
175 }
176 }
177
178 // TODO: Add a coresponding free() method
179 fn open_target_sequences(fasta_file: &str) -> *mut libhmmer_sys::esl_sqio_s {
180 // int dbfmt = eslSQFILE_UNKNOWN; /* format code for sequence database file */
181 let dbfmt = 0; //libhmmer_sys::eslSQFILE_UNKNOWN;
182
183 // ESL_SQFILE *dbfp = NULL; /* open input sequence file */
184 let mut dbfp: *mut libhmmer_sys::ESL_SQFILE = std::ptr::null_mut();
185
186 // /* Open the target sequence database */
187 // status = esl_sqfile_Open(cfg->dbfile, dbfmt, p7_SEQDBENV, &dbfp);
188 // if (status == eslENOTFOUND) p7_Fail("Failed to open sequence file %s for reading\n", cfg->dbfile);
189 // else if (status == eslEFORMAT) p7_Fail("Sequence file %s is empty or misformatted\n", cfg->dbfile);
190 // else if (status == eslEINVAL) p7_Fail("Can't autodetect format of a stdin or .gz seqfile");
191 // else if (status != eslOK) p7_Fail("Unexpected error %d opening sequence file %s\n", status, cfg->dbfile);
192 let file_pointer = CString::new(fasta_file.as_bytes()).unwrap().into_raw();
193 let status = unsafe {
194 // Open the file not assuming anything about its format, and let
195 // autodetect do its thing. Possibly we should use eslSQFILE_FASTA.
196 libhmmer_sys::esl_sqfile_Open(file_pointer, dbfmt, std::ptr::null(), &mut dbfp)
197 };
198 println!("Opened fasta file with status {status}");
199
200 if status == libhmmer_sys::eslENOTFOUND as i32 {
201 panic!("Failed to open sequence file {fasta_file} for reading");
202 } else if status == libhmmer_sys::eslEFORMAT as i32 {
203 panic!("Sequence file {fasta_file} is empty or misformatted");
204 } else if status == libhmmer_sys::eslEINVAL as i32 {
205 panic!("Can't autodetect format of a stdin or .gz seqfile");
206 } else if status != libhmmer_sys::eslOK as i32 {
207 panic!("Unexpected error {status} opening sequence file {fasta_file}");
208 }
209 dbfp
210 }
211
212 /// This method (called serial_loop in C) is not available in libhmmer_sys,
213 /// so we have to implement it here. Intended as a direct replacement for
214 /// the C function.
215 fn serial_loop_over_esl_sqio(
216 &mut self,
217 dbfp: *mut libhmmer_sys::esl_sqio_s,
218 n_targetseqs: i32,
219 ) -> i32 {
220 debug!("serial_loop");
221
222 // int status; /* easel return code */
223 let mut sstatus: i32;
224
225 // ESL_SQ *dbsq = NULL; /* one target sequence (digital) */
226 #[allow(unused_mut, unused_assignments)]
227 let mut dbsq: *mut libhmmer_sys::ESL_SQ = std::ptr::null_mut();
228
229 // int seq_cnt = 0;
230 let mut seq_cnt: i32 = 0;
231
232 // For convenience
233 let info = &mut self.info;
234
235 // dbsq = esl_sq_CreateDigital(info->om->abc);
236 dbsq = unsafe {
237 let abc_here = (*info.om).abc;
238 debug!("Creating digital sequence from abc {:?}", abc_here);
239 debug!("K in abc is {}", (*abc_here).K);
240 libhmmer_sys::esl_sq_CreateDigital(abc_here)
241 };
242 debug!("dbsq created as digital, from abc {:?}", unsafe {
243 (*dbsq).abc
244 });
245
246 // /* Main loop: */
247 // while ( (n_targetseqs==-1 || seq_cnt<n_targetseqs) && (sstatus = esl_sqio_Read(dbfp, dbsq)) == eslOK)
248 // {
249 sstatus = unsafe { libhmmer_sys::esl_sqio_Read(dbfp, dbsq) };
250 debug!("esl_sqio_Read returned {}", sstatus);
251 debug!("dbsq is {:?}", dbsq);
252 debug!("dbsq internals: {:#?}", EaselSequence { c_sq: dbsq });
253
254 while (n_targetseqs == -1 || seq_cnt < n_targetseqs)
255 && sstatus == libhmmer_sys::eslOK as i32
256 {
257 unsafe {
258 // p7_pli_NewSeq(info->pli, dbsq);
259 libhmmer_sys::p7_pli_NewSeq(info.pli, dbsq);
260
261 // p7_bg_SetLength(info->bg, dbsq->n);
262 libhmmer_sys::p7_bg_SetLength(
263 info.bg,
264 (*dbsq).n.try_into().expect("i64 -> i32 failed"),
265 );
266 // p7_oprofile_ReconfigLength(info->om, dbsq->n);
267 libhmmer_sys::p7_oprofile_ReconfigLength(
268 info.om,
269 (*dbsq).n.try_into().expect("i64 -> i32 failed"),
270 );
271
272 // p7_Pipeline(info->pli, info->om, info->bg, dbsq, NULL, info->th);
273 let p7_sstatus = libhmmer_sys::p7_Pipeline(
274 info.pli,
275 info.om,
276 info.bg,
277 dbsq,
278 std::ptr::null_mut(),
279 info.th,
280 );
281 if p7_sstatus != libhmmer_sys::eslOK as i32 {
282 panic!("p7_Pipeline sstatus indicated failure, was {p7_sstatus}");
283 }
284
285 // In the C code, this is part of the while loop condition.
286 sstatus = libhmmer_sys::esl_sqio_Read(dbfp, dbsq);
287 debug!("esl_sqio_Read returned {sstatus}");
288 }
289
290 // seq_cnt++;
291 seq_cnt += 1;
292 // esl_sq_Reuse(dbsq);
293 // p7_pipeline_Reuse(info->pli);
294 unsafe {
295 libhmmer_sys::esl_sq_Reuse(dbsq);
296 libhmmer_sys::p7_pipeline_Reuse(info.pli);
297 }
298 }
299
300 // if (n_targetseqs!=-1 && seq_cnt==n_targetseqs)
301 // sstatus = eslEOF;
302 if n_targetseqs != -1 && seq_cnt == n_targetseqs {
303 sstatus = libhmmer_sys::eslEOF as i32;
304 }
305
306 // esl_sq_Destroy(dbsq);
307 unsafe {
308 libhmmer_sys::esl_sq_Destroy(dbsq);
309 }
310
311 sstatus
312 }
313
314 pub fn query(&mut self, easel_sequence: &EaselSequence) {
315 let info = &mut self.info;
316
317 unsafe {
318 // p7_pli_NewSeq(info->pli, dbsq);
319 if libhmmer_sys::p7_pli_NewSeq(info.pli, easel_sequence.c_sq)
320 != libhmmer_sys::eslOK as i32
321 {
322 panic!()
323 };
324
325 // p7_bg_SetLength(info->bg, dbsq->n);
326 if libhmmer_sys::p7_bg_SetLength(
327 info.bg,
328 (*easel_sequence.c_sq)
329 .n
330 .try_into()
331 .expect("i64 -> i32 failed"),
332 ) != libhmmer_sys::eslOK as i32
333 {
334 panic!()
335 };
336 // p7_oprofile_ReconfigLength(info->om, dbsq->n);
337 if libhmmer_sys::p7_oprofile_ReconfigLength(
338 info.om,
339 (*easel_sequence.c_sq)
340 .n
341 .try_into()
342 .expect("i64 -> i32 failed"),
343 ) != libhmmer_sys::eslOK as i32
344 {
345 panic!()
346 };
347
348 // p7_Pipeline(info->pli, info->om, info->bg, dbsq, NULL, info->th);
349 let sstatus = libhmmer_sys::p7_Pipeline(
350 info.pli,
351 info.om,
352 info.bg,
353 easel_sequence.c_sq,
354 std::ptr::null_mut(),
355 info.th,
356 );
357 debug!("query p7_Pipeline sstatus {}", sstatus);
358 if sstatus != libhmmer_sys::eslOK as i32 {
359 panic!("p7_Pipeline sstatus indicated failure, was {sstatus}");
360 }
361 }
362 }
363
364 pub fn get_results(&mut self) -> HmmsearchResult {
365 unsafe {
366 debug!("Running p7_tophits_SortBySortkey");
367 libhmmer_sys::p7_tophits_SortBySortkey(self.info.th);
368 debug!("Running p7_tophits_Threshold");
369 libhmmer_sys::p7_tophits_Threshold(self.info.th, self.info.pli);
370 }
371 HmmsearchResult {
372 c_th: self.info.th,
373 c_pli: self.info.pli,
374 }
375 }
376}
377
378// typedef struct {
379// P7_BG *bg; /* null model */
380// P7_PIPELINE *pli; /* work pipeline */
381// P7_TOPHITS *th; /* top hit results */
382// P7_OPROFILE *om; /* optimized query profile */
383// } WORKER_INFO;
384pub struct HmmsearchWorkerInfo {
385 bg: *mut libhmmer_sys::P7_BG,
386 pli: *mut libhmmer_sys::P7_PIPELINE,
387 th: *mut libhmmer_sys::P7_TOPHITS,
388 om: *mut libhmmer_sys::P7_OPROFILE,
389}
390
391#[derive(Debug)]
392pub struct HmmsearchResult {
393 pub c_th: *mut libhmmer_sys::p7_tophits_s,
394 pub c_pli: *mut libhmmer_sys::p7_pipeline_s,
395}
396
397impl Drop for HmmsearchResult {
398 fn drop(&mut self) {
399 unsafe {
400 libhmmer_sys::p7_tophits_Destroy(self.c_th);
401 libhmmer_sys::p7_pipeline_Destroy(self.c_pli);
402 }
403 }
404}
405
406impl HmmsearchResult {
407 /// Number of reported hits
408 pub fn nreported(&self) -> usize {
409 unsafe {
410 (*self.c_th)
411 .nreported
412 .try_into()
413 .expect("i64 -> usize failed")
414 }
415 }
416
417 pub fn hits(&self) -> HmmsearchResultTopHits {
418 HmmsearchResultTopHits {
419 c_th: self.c_th,
420 c_pli: self.c_pli,
421 current: 0,
422 nreported: self.nreported(),
423 }
424 }
425}
426
427pub struct HmmsearchResultTopHits {
428 c_th: *mut libhmmer_sys::p7_tophits_s,
429 c_pli: *mut libhmmer_sys::p7_pipeline_s,
430 current: usize,
431 nreported: usize,
432}
433
434impl Iterator for HmmsearchResultTopHits {
435 type Item = HmmsearchResultTopHit;
436
437 fn next(&mut self) -> Option<Self::Item> {
438 if self.current >= self.nreported {
439 return None;
440 }
441
442 let hit = HmmsearchResultTopHit {
443 c_hit: unsafe { *(*self.c_th).hit.add(self.current) },
444 c_pli: self.c_pli,
445 current_domain: 0,
446 };
447
448 self.current += 1;
449
450 Some(hit)
451 }
452}
453
454pub struct HmmsearchResultTopHit {
455 c_hit: *mut libhmmer_sys::p7_hit_s,
456 c_pli: *mut libhmmer_sys::p7_pipeline_s,
457 current_domain: usize,
458}
459
460impl HmmsearchResultTopHit {
461 // println!("Name of first hit {}", unsafe {
462 // CStr::from_ptr(first_hit.name).to_string_lossy()
463 // });
464 pub fn name(&self) -> String {
465 unsafe {
466 CStr::from_ptr((*self.c_hit).name)
467 .to_string_lossy()
468 .into_owned()
469 }
470 }
471
472 // println!("Score of first hit overall {}", first_hit.score);
473 pub fn score(&self) -> f32 {
474 unsafe { (*self.c_hit).score }
475 }
476}
477
478impl Iterator for HmmsearchResultTopHit {
479 type Item = HmmsearchResultDomain;
480
481 fn next(&mut self) -> Option<Self::Item> {
482 // TODO: Check for end of hits
483 if self.current_domain >= unsafe { (*self.c_hit).ndom.try_into().unwrap() } {
484 return None;
485 }
486 println!("current domain counter {} ", self.current_domain);
487 let domain = HmmsearchResultDomain {
488 c_dom: unsafe { (*self.c_hit).dcl.add(self.current_domain) },
489 c_pli: self.c_pli,
490 };
491
492 self.current_domain += 1;
493
494 Some(domain)
495 }
496}
497
498pub struct HmmsearchResultDomain {
499 c_dom: *mut libhmmer_sys::p7_dom_s,
500 c_pli: *mut libhmmer_sys::p7_pipeline_s,
501}
502
503impl HmmsearchResultDomain {
504 // println!("First domain score: {}", first_domain.bitscore);
505 pub fn bitscore(&self) -> f32 {
506 unsafe { (*self.c_dom).bitscore }
507 }
508
509 // // exp(th->hit[h]->lnP) * pli->Z;
510 // let evalue = first_domain.lnP.exp() * unsafe { (*hmmsearch_result.c_pli).Z };
511 // println!("First domain evalue: {:?}", evalue);
512 pub fn evalue(&self) -> f64 {
513 unsafe { (*self.c_dom).lnP.exp() * (*self.c_pli).Z }
514 }
515}