rust_htslib/bgzf/
mod.rs

1// Copyright 2020 Manuel Landesfeind, Evotec International GmbH
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//!
//! Module for working with bgzipped files.
8//!
9
10use std::ffi;
11use std::path::Path;
12use url::Url;
13
14use crate::htslib;
15use crate::tpool::ThreadPool;
16
17use crate::errors::{Error, Result};
18
19fn path_as_bytes<'a, P: 'a + AsRef<Path>>(path: P, must_exist: bool) -> Result<Vec<u8>> {
20    if path.as_ref().exists() || !must_exist {
21        Ok(path
22            .as_ref()
23            .to_str()
24            .ok_or(Error::NonUnicodePath)?
25            .as_bytes()
26            .to_owned())
27    } else {
28        Err(Error::FileNotFound {
29            path: path.as_ref().to_owned(),
30        })
31    }
32}
33
34/// Test if a file is a Bgzip compressed file
35///
36/// # Arguments
37///
38/// * `path` - the path to test.
39///
40/// # Returns:
41/// Will return `Ok(true)` or `Ok(false)` if the file at `path` is BGZIP compressed. Will return an `Err` in
42/// cases where no testing is possible.
43pub fn is_bgzip<P: AsRef<Path>>(path: P) -> Result<bool, Error> {
44    let byte_path = path_as_bytes(path, true)?;
45    let cpath = ffi::CString::new(byte_path).unwrap();
46    let is_bgzf = unsafe { htslib::bgzf_is_bgzf(cpath.as_ptr()) == 1 };
47    Ok(is_bgzf)
48}
49
50/// A reader that transparently reads uncompressed, gzip, and bgzip files.
51#[derive(Debug)]
52pub struct Reader {
53    inner: *mut htslib::BGZF,
54}
55
56impl Reader {
57    /// Create a new Reader to read from stdin.
58    pub fn from_stdin() -> Result<Self, Error> {
59        Self::new(b"-")
60    }
61
62    /// Create a new Reader from a path.
63    ///
64    /// # Arguments
65    ///
66    /// * `path` - the path to open.
67    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
68        Self::new(&path_as_bytes(path, true)?)
69    }
70
71    /// Create a new Reader from an URL.
72    ///
73    /// # Arguments
74    ///
75    /// * `url` - the url to open
76    pub fn from_url(url: &Url) -> Result<Self, Error> {
77        Self::new(url.as_str().as_bytes())
78    }
79
80    /// Internal function to create a Reader from some sort of path (could be file path but also URL).
81    /// The path or URL will be handled by the c-implementation transparently.
82    ///
83    /// # Arguments
84    ///
85    /// * `path` - the path or URL to open
86    fn new(path: &[u8]) -> Result<Self, Error> {
87        let mode = ffi::CString::new("r").unwrap();
88        let cpath = ffi::CString::new(path).unwrap();
89        let inner = unsafe { htslib::bgzf_open(cpath.as_ptr(), mode.as_ptr()) };
90        if !inner.is_null() {
91            Ok(Self { inner })
92        } else {
93            Err(Error::FileOpen {
94                path: String::from_utf8(path.to_vec()).unwrap(),
95            })
96        }
97    }
98
99    /// Set the thread pool to use for parallel decompression.
100    ///
101    /// # Arguments
102    ///
103    /// * `tpool` - the thread-pool to use
104    pub fn set_thread_pool(&mut self, tpool: &ThreadPool) -> Result<()> {
105        let b = tpool.handle.borrow_mut();
106        let r = unsafe {
107            htslib::bgzf_thread_pool(self.inner, b.inner.pool as *mut _, 0) // let htslib decide on the queue-size
108        };
109
110        if r != 0 {
111            Err(Error::ThreadPool)
112        } else {
113            Ok(())
114        }
115    }
116}
117
118impl std::io::Read for Reader {
119    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
120        let nbytes = unsafe {
121            htslib::bgzf_read(self.inner, buf.as_mut_ptr() as *mut libc::c_void, buf.len())
122        };
123        if nbytes < 0 {
124            Err(std::io::Error::other("Can not read"))
125        } else {
126            Ok(nbytes as usize)
127        }
128    }
129}
130
131/// The CompressionLevel used by the underlying GZIP writer
132/// Note that the special level NoCompression will not use the GZIP writer.
133/// Compression levels in BGZF files
134///
135/// * Uncompressed: No compression, zlib level 0
136/// * Fastest: Lowest compression level, zlib level 1
137/// * Maximum: Highest compression level, zlib level 9
138/// * Default: Default compression level, zlib level 6
139/// * Level(i): Custom compression level in the range [0, 9]
140/// * NoCompression: No compression, zlib not used. Output will be identical to input
141#[derive(Debug, Clone, Copy)]
142pub enum CompressionLevel {
143    Default,
144    NoCompression,
145    Uncompressed,
146    Fastest,
147    Maximum,
148    Level(i8),
149}
150impl CompressionLevel {
151    // Convert and check the variants of the `CompressionLevel` enum to a numeric level
152    fn convert(self) -> Result<i8> {
153        match self {
154            CompressionLevel::NoCompression => Ok(-2),
155            CompressionLevel::Default => Ok(-1),
156            CompressionLevel::Uncompressed => Ok(0),
157            CompressionLevel::Fastest => Ok(1),
158            CompressionLevel::Maximum => Ok(9),
159            CompressionLevel::Level(i @ -2..=9) => Ok(i),
160            CompressionLevel::Level(i) => Err(Error::BgzfInvalidCompressionLevel { level: i }),
161        }
162    }
163}
164
165/// A writer that writes uncompressed, gzip, and bgzip files.
166#[derive(Debug)]
167pub struct Writer {
168    inner: *mut htslib::BGZF,
169    tpool: Option<ThreadPool>,
170}
171
172impl Writer {
173    /// Create a new Writer to write to stdout with default compression.
174    pub fn from_stdout() -> Result<Self, Error> {
175        Self::from_stdout_with_compression(CompressionLevel::Default)
176    }
177
178    /// Create a new Writer to write to stdout with specific compression
179    ///
180    /// # Arguments
181    ///
182    /// * `level` the compression level to use
183    pub fn from_stdout_with_compression(level: CompressionLevel) -> Result<Self, Error> {
184        Self::new(b"-", level)
185    }
186
187    /// Create a new Writer from a path with default compression.
188    ///
189    /// # Arguments
190    ///
191    /// * `path` - the path to open.
192    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
193        Self::from_path_with_level(path, CompressionLevel::Default)
194    }
195
196    /// Create a new Writer from a path with a specific compression level.
197    ///
198    /// # Arguments
199    ///
200    /// * `path` - the path to open.
201    pub fn from_path_with_level<P: AsRef<Path>>(
202        path: P,
203        level: CompressionLevel,
204    ) -> Result<Self, Error> {
205        Self::new(&path_as_bytes(path, false)?, level)
206    }
207
208    /// Internal function to create a Writer from a file path
209    ///
210    /// # Arguments
211    ///
212    /// * `path` - the path or URL to open
213    fn new(path: &[u8], level: CompressionLevel) -> Result<Self, Error> {
214        let mode = Self::get_open_mode(level)?;
215        let cpath = ffi::CString::new(path).unwrap();
216        let inner = unsafe { htslib::bgzf_open(cpath.as_ptr(), mode.as_ptr()) };
217        if !inner.is_null() {
218            Ok(Self { inner, tpool: None })
219        } else {
220            Err(Error::FileOpen {
221                path: String::from_utf8(path.to_vec()).unwrap(),
222            })
223        }
224    }
225
226    /// Internal function to convert compression level to "mode"
227    /// bgzf.c expects mode for writers to be one of: 'w', 'wu', 'w#', where # is 0-9.
228    /// # Arguments
229    ///
230    /// * `level` - the level of compression to use
231    fn get_open_mode(level: CompressionLevel) -> Result<ffi::CString, Error> {
232        let write_string = match level.convert() {
233            Ok(-2) => "wu".to_string(),
234            Ok(-1) => "w".to_string(),
235            Ok(n @ 0..=9) => format!("w{}", n),
236            Err(e) => return Err(e),
237            // This should be unreachable
238            Ok(i) => return Err(Error::BgzfInvalidCompressionLevel { level: i }),
239        };
240        Ok(ffi::CString::new(write_string).unwrap())
241    }
242
243    /// Set the thread pool to use for parallel compression.
244    ///
245    /// # Arguments
246    ///
247    /// * `tpool` - the thread-pool to use
248    pub fn set_thread_pool(&mut self, tpool: &ThreadPool) -> Result<()> {
249        self.tpool = Some(tpool.clone());
250        let b = tpool.handle.borrow_mut();
251        let r = unsafe {
252            htslib::bgzf_thread_pool(self.inner, b.inner.pool as *mut _, 0) // let htslib decide on the queue-size
253        };
254
255        if r != 0 {
256            Err(Error::ThreadPool)
257        } else {
258            Ok(())
259        }
260    }
261}
262
263impl std::io::Write for Writer {
264    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
265        let nbytes =
266            unsafe { htslib::bgzf_write(self.inner, buf.as_ptr() as *mut libc::c_void, buf.len()) };
267        if nbytes < 0 {
268            Err(std::io::Error::other("Can not write"))
269        } else {
270            Ok(nbytes as usize)
271        }
272    }
273
274    fn flush(&mut self) -> std::io::Result<()> {
275        let exit_code: i32 = unsafe { htslib::bgzf_flush(self.inner) };
276        if exit_code == 0 {
277            Ok(())
278        } else {
279            Err(std::io::Error::other("Can not flush"))
280        }
281    }
282}
283
284impl std::ops::Drop for Writer {
285    fn drop(&mut self) {
286        unsafe {
287            htslib::bgzf_close(self.inner);
288        }
289    }
290}
291
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Read, Write};

    // Fixture files: the tests below expect all three to hold the same text.
    const FN_PLAIN: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/test/bgzip/plain.vcf");
    const FN_GZIP: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/test/bgzip/gzip.vcf.gz");
    const FN_BGZIP: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/test/bgzip/bgzip.vcf.gz");

    // Text of the plain fixture, used as the expected payload everywhere.
    const CONTENT: &str = include_str!("../../test/bgzip/plain.vcf");

    /// Create the temporary directory a writer test writes into.
    fn scratch_dir() -> tempfile::TempDir {
        tempfile::Builder::new()
            .prefix("rust-htslib")
            .tempdir()
            .expect("Cannot create temp dir")
    }

    /// Print the error of a failed writer construction so it is visible in the test output.
    fn log_open_error(opened: &Result<Writer>) {
        if let Err(e) = opened {
            println!("w_result is {}", e);
        }
    }

    /// Read the whole file at `path` back through a `Reader`.
    fn read_back(path: &std::path::Path) -> String {
        let mut text = String::new();
        Reader::from_path(path)
            .unwrap()
            .read_to_string(&mut text)
            .unwrap();
        text
    }

    /// Open the fixture at `path` and check size and content; `kind` names the
    /// fixture flavour in the assertion messages.
    fn check_open(path: &str, kind: &str) {
        let opened = Reader::from_path(path);
        assert!(opened.is_ok(), "Open {} file with Bgzip reader", kind);

        let mut text = String::new();
        let outcome = opened.unwrap().read_to_string(&mut text);
        assert!(outcome.is_ok(), "Reading {} file into buffer is ok", kind);
        assert_eq!(
            outcome.unwrap(),
            190,
            "Reading {} file into buffer is correct size",
            kind
        );
        assert_eq!(text, CONTENT, "Reading {} file with correct content", kind);
    }

    #[test]
    fn test_is_bgzip_plain() {
        let plain = is_bgzip(FN_PLAIN).unwrap();
        assert!(!plain, "Plain file not detected as BGZIP");

        let gzip = is_bgzip(FN_GZIP).unwrap();
        assert!(!gzip, "Zip file not detected as BGZIP");

        let bgzip = is_bgzip(FN_BGZIP).unwrap();
        assert!(bgzip, "Bgzip file detected as BGZIP");
    }

    #[test]
    fn test_open_plain() {
        check_open(FN_PLAIN, "plain");
    }

    #[test]
    fn test_open_gzip() {
        check_open(FN_GZIP, "gzip");
    }

    #[test]
    fn test_open_bgzip() {
        check_open(FN_BGZIP, "bgzip");
    }

    #[test]
    fn test_set_threadpool() {
        let opened = Reader::from_path(FN_BGZIP);
        assert!(opened.is_ok(), "Open bgzip file with Bgzip reader");
        let mut reader = opened.unwrap();

        let pool = ThreadPool::new(5);
        assert!(pool.is_ok(), "Creating thread pool");
        let pool = pool.unwrap();

        assert_eq!(
            reader.set_thread_pool(&pool),
            Ok(()),
            "Setting thread pool okay"
        );

        let mut text = String::new();
        let outcome = reader.read_to_string(&mut text);
        assert!(
            outcome.is_ok(),
            "Reading bgzip file into buffer is ok - using a threadpool"
        );
        assert_eq!(
            outcome.unwrap(),
            190,
            "Reading bgzip file into buffer is correct size using a threadpool"
        );
        assert_eq!(
            text, CONTENT,
            "Reading bgzip file with correct content using a threadpool"
        );
    }

    #[test]
    fn test_write_plain() {
        let tmp = scratch_dir();
        let out_path = tmp.path().join("test.vcf");
        println!("{:?}", out_path);

        {
            let opened = Writer::from_path_with_level(&out_path, CompressionLevel::NoCompression);
            log_open_error(&opened);
            assert!(opened.is_ok(), "Create plain file with Bgzip writer");
            assert!(out_path.exists(), "Plain file is created with Bgzip writer");

            let mut writer = opened.unwrap();
            let written = writer.write_all(CONTENT.as_bytes());
            assert!(written.is_ok(), "Plain file can write with Bgzip writer");
        } // the writer is dropped here, which closes the file

        assert!(
            !is_bgzip(&out_path).unwrap(),
            "NoCompression file should not be detected as BGZIP"
        );
        // NoCompression output is expected to be readable as ordinary text.
        let on_disk = std::fs::read_to_string(&out_path).unwrap();
        assert_eq!(on_disk, CONTENT, "Writing bgzip file with no compression");

        tmp.close().expect("Failed to delete temp dir");
    }

    #[test]
    fn test_write_default() {
        let tmp = scratch_dir();
        let out_path = tmp.path().join("test.vcf.bgzf");
        println!("{:?}", out_path);

        {
            let opened = Writer::from_path(&out_path);
            log_open_error(&opened);
            assert!(opened.is_ok(), "Create bgzip file with Bgzip writer");
            assert!(out_path.exists(), "Bgzip file is created with Bgzip writer");

            let mut writer = opened.unwrap();
            let written = writer.write_all(CONTENT.as_bytes());
            assert!(written.is_ok(), "Bgzip file can write with Bgzip writer");
        } // the writer is dropped here, which closes the file

        // Round-trip through the bgzip reader.
        assert_eq!(
            read_back(&out_path),
            CONTENT,
            "Writing bgzip file with default compression"
        );
        assert!(
            is_bgzip(&out_path).unwrap(),
            "Default BGZIP file detected as BGZIP"
        );

        tmp.close().expect("Failed to delete temp dir");
    }

    #[test]
    fn test_write_compression_levels() {
        let tmp = scratch_dir();
        let out_path = tmp.path().join("test.vcf.bgzf");

        // Every level except NoCompression: the named ones, then each custom level.
        let named = [
            CompressionLevel::Fastest,
            CompressionLevel::Maximum,
            CompressionLevel::Uncompressed,
        ];
        let custom = (-1..=9_i8).map(CompressionLevel::Level);

        for level in named.iter().copied().chain(custom) {
            {
                let opened = Writer::from_path_with_level(&out_path, level);
                log_open_error(&opened);
                assert!(opened.is_ok(), "Create bgzip file with Bgzip writer");
                assert!(out_path.exists(), "Bgzip file is created with Bgzip writer");

                let mut writer = opened.unwrap();
                let written = writer.write_all(CONTENT.as_bytes());
                assert!(written.is_ok(), "Bgzip file can write with Bgzip writer");
            } // the writer is dropped here, which closes the file

            // Round-trip through the bgzip reader.
            assert_eq!(
                read_back(&out_path),
                CONTENT,
                "Writing bgzip file with {:?} compression",
                level
            );
            assert!(
                is_bgzip(&out_path).unwrap(),
                "Writing BGZIP file with {:?} compression detected as BGZIP",
                level
            );
        }

        tmp.close().expect("Failed to delete temp dir");
    }

    #[test]
    fn test_write_with_threadpool() {
        let tmp = scratch_dir();
        let out_path = tmp.path().join("test.vcf.bgzf");

        let content = CONTENT.as_bytes();
        println!("{:?}", out_path);

        {
            let opened = Writer::from_path(&out_path);
            log_open_error(&opened);
            assert!(opened.is_ok(), "Create bgzip file with Bgzip threadpool");
            assert!(
                out_path.exists(),
                "Bgzip file is created with Bgzip threadpool"
            );
            let mut writer = opened.unwrap();

            let pool = ThreadPool::new(5);
            assert!(pool.is_ok(), "Creating thread pool");
            let pool = pool.unwrap();

            let attached = writer.set_thread_pool(&pool);
            assert!(attached.is_ok(), "Setting thread pool");

            let written = writer.write_all(content);
            assert!(written.is_ok(), "Bgzip file can write with Bgzip threadpool");
        } // the writer is dropped here, which closes the file

        // Round-trip through the bgzip reader.
        assert_eq!(
            read_back(&out_path),
            CONTENT,
            "Writing bgzip file with threadpool"
        );
        assert!(
            is_bgzip(&out_path).unwrap(),
            "Threadpool BGZIP file detected as BGZIP"
        );

        tmp.close().expect("Failed to delete temp dir");
    }
}