Skip to main content

rust_htslib/bcf/
header.rs

1// Copyright 2014 Johannes Köster.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5//! Module for working with VCF or BCF headers.
6//!
7//! # Examples
8//! From the header of a VCF file we can
9//!   - Output sample count of a VCF file
10//!   - Output sample names of a VCF file
11//!   - Output sample index given a sample name of a VCF file.
12//! ```
13//! use crate::rust_htslib::bcf::{Reader, Read};
14//! use std::io::Read as IoRead;
15//!
16//! let path = &"test/test_string.vcf";
17//! let mut bcf = Reader::from_path(path).expect("Error opening file.");
18//! let header = bcf.header();
19//! assert_eq!(header.sample_count(), 2);  // Sample count
20//! let mut s = String::new();
21//! for (i, mut x) in header.samples().into_iter().enumerate() {
22//!     x.read_to_string(&mut s);  // Read sample name in to `s`
23//!     println!("{}", s);  // output sample name
24//! }
25//! assert_eq!(header.sample_id(b"one").unwrap(), 0);  // Sample index wrapped in Option<usize>
26//! assert_eq!(header.sample_id(b"two").unwrap(), 1);  // Sample index wrapped in Option<usize>
27//! assert!(header.sample_id(b"non existent sample").is_none());  // Return none if not found
28//!
29//! assert_eq!(header.contig_count(), 1); // Number of contig in header.
30//! // obtain the data type of an INFO field
31//! let (tag_type, tag_length) = header.info_type(b"S1").unwrap();
32//! let (fmt_type, fmt_length) = header.format_type(b"GT").unwrap();
33//! ```
34
35use std::ffi;
36use std::os::raw::c_char;
37use std::slice;
38use std::str;
39use std::sync::Arc;
40
41use crate::htslib;
42
43use linear_map::LinearMap;
44
45use crate::errors::{Error, Result};
46
47pub type SampleSubset = Vec<i32>;
48
// `Id` wraps the raw `u32` index of an entry in one of the header's
// dictionaries. In this file it is produced by `HeaderView::name_to_id` and
// `HeaderView::sample_to_id`, and dereferenced (`*id`) to get the number back
// in `HeaderView::id_to_name` / `HeaderView::id_to_sample`.
custom_derive! {
    /// A newtype for IDs from BCF headers.
    #[derive(
        NewtypeFrom,
        NewtypeDeref,
        PartialEq,
        PartialOrd,
        Eq,
        Ord,
        Copy,
        Clone,
        Debug
    )]
    pub struct Id(pub u32);
}
64
/// A BCF header.
///
/// Owns a raw htslib header: the pointer is created by the constructors in
/// `impl Header` and destroyed with `bcf_hdr_destroy` when the value is dropped.
#[derive(Debug)]
pub struct Header {
    /// Raw pointer to the underlying htslib header owned by this value.
    pub(crate) inner: *mut htslib::bcf_hdr_t,
    /// Per-sample index map filled in by htslib when the header was built with
    /// [`Header::from_template_subset`]; `None` for every other constructor.
    pub subset: Option<SampleSubset>,
}

// NOTE(review): these impls assert that the raw htslib header may be moved to
// and shared between threads. Nothing in this file enforces that; confirm
// against htslib's thread-safety guarantees before relying on it.
unsafe impl Send for Header {}
unsafe impl Sync for Header {}
74
75impl Default for Header {
76    fn default() -> Self {
77        Self::new()
78    }
79}
80
81impl Header {
82    /// Create a new (empty) `Header`.
83    pub fn new() -> Self {
84        let c_str = ffi::CString::new(&b"w"[..]).unwrap();
85        Header {
86            inner: unsafe { htslib::bcf_hdr_init(c_str.as_ptr()) },
87            subset: None,
88        }
89    }
90
91    /// Get a pointer to the raw header.
92    ///
93    /// # Safety
94    /// The caller must ensure that the pointer is not used after this `Header`
95    /// is dropped
96    pub unsafe fn inner_ptr(&self) -> *mut htslib::bcf_hdr_t {
97        self.inner
98    }
99
100    /// Create a new `Header` using the given `HeaderView` as the template.
101    ///
102    /// After construction, you can modify the header independently from the template `header`.
103    ///
104    /// # Arguments
105    ///
106    /// - `header` - The `HeaderView` to use as the template.
107    pub fn from_template(header: &HeaderView) -> Self {
108        Header {
109            inner: unsafe { htslib::bcf_hdr_dup(header.inner) },
110            subset: None,
111        }
112    }
113
114    /// Create a new `Header` using the given `HeaderView` as as template, but subsetting to the
115    /// given `samples`.
116    ///
117    /// # Arguments
118    ///
119    /// - `header` - The `HeaderView` to use for the template.
120    /// - `samples` - A slice of byte-encoded (`[u8]`) sample names.
121    pub fn from_template_subset(header: &HeaderView, samples: &[&[u8]]) -> Result<Self> {
122        let mut imap = vec![0; samples.len()];
123        let names: Vec<_> = samples
124            .iter()
125            .map(|&s| ffi::CString::new(s).unwrap())
126            .collect();
127        let name_pointers: Vec<_> = names.iter().map(|s| s.as_ptr() as *mut i8).collect();
128        #[allow(clippy::unnecessary_cast)]
129        let name_pointers_ptr = name_pointers.as_ptr() as *const *mut c_char;
130        let inner = unsafe {
131            htslib::bcf_hdr_subset(
132                header.inner,
133                samples.len() as i32,
134                name_pointers_ptr,
135                imap.as_mut_ptr(),
136            )
137        };
138        if inner.is_null() {
139            Err(Error::BcfDuplicateSampleNames)
140        } else {
141            Ok(Header {
142                inner,
143                subset: Some(imap),
144            })
145        }
146    }
147
148    /// Add a `sample` to the header.
149    ///
150    /// # Arguments
151    ///
152    /// - `sample` - Name of the sample to add (to the end of the sample list).
153    pub fn push_sample(&mut self, sample: &[u8]) -> &mut Self {
154        let c_str = ffi::CString::new(sample).unwrap();
155        unsafe { htslib::bcf_hdr_add_sample(self.inner, c_str.as_ptr()) };
156        self
157    }
158
159    /// Add a record to the header.
160    ///
161    /// # Arguments
162    ///
163    /// - `record` - String representation of the header line
164    ///
165    /// # Example
166    ///
167    /// ```rust,ignore
168    /// header.push_record(format!("##contig=<ID={},length={}>", "chrX", 155270560).as_bytes());
169    /// ```
170    pub fn push_record(&mut self, record: &[u8]) -> &mut Self {
171        let c_str = ffi::CString::new(record).unwrap();
172        unsafe { htslib::bcf_hdr_append(self.inner, c_str.as_ptr()) };
173        self
174    }
175
176    /// Remove a `FILTER` entry from the header.
177    ///
178    /// # Arguments
179    ///
180    /// - `tag` - Name of the `FLT` tag to remove.
181    pub fn remove_filter(&mut self, tag: &[u8]) -> &mut Self {
182        self.remove_impl(tag, htslib::BCF_HL_FLT)
183    }
184
185    /// Remove an `INFO` entry from the header.
186    ///
187    /// # Arguments
188    ///
189    /// - `tag` - Name of the `INFO` tag to remove.
190    pub fn remove_info(&mut self, tag: &[u8]) -> &mut Self {
191        self.remove_impl(tag, htslib::BCF_HL_INFO)
192    }
193
194    /// Remove a `FORMAT` entry from the header.
195    ///
196    /// # Arguments
197    ///
198    /// - `tag` - Name of the `FORMAT` tag to remove.
199    pub fn remove_format(&mut self, tag: &[u8]) -> &mut Self {
200        self.remove_impl(tag, htslib::BCF_HL_FMT)
201    }
202
203    /// Remove a contig entry from the header.
204    ///
205    /// # Arguments
206    ///
207    /// - `tag` - Name of the `FORMAT` tag to remove.
208    pub fn remove_contig(&mut self, tag: &[u8]) -> &mut Self {
209        self.remove_impl(tag, htslib::BCF_HL_CTG)
210    }
211
212    /// Remove a structured entry from the header.
213    ///
214    /// # Arguments
215    ///
216    /// - `tag` - Name of the structured tag to remove.
217    pub fn remove_structured(&mut self, tag: &[u8]) -> &mut Self {
218        self.remove_impl(tag, htslib::BCF_HL_STR)
219    }
220
221    /// Remove a generic entry from the header.
222    ///
223    /// # Arguments
224    ///
225    /// - `tag` - Name of the generic tag to remove.
226    pub fn remove_generic(&mut self, tag: &[u8]) -> &mut Self {
227        self.remove_impl(tag, htslib::BCF_HL_GEN)
228    }
229
230    /// Implementation of removing header tags.
231    fn remove_impl(&mut self, tag: &[u8], type_: u32) -> &mut Self {
232        unsafe {
233            let v = tag.to_vec();
234            let c_str = ffi::CString::new(v).unwrap();
235            htslib::bcf_hdr_remove(self.inner, type_ as i32, c_str.as_ptr());
236        }
237        self
238    }
239}
240
241impl Drop for Header {
242    fn drop(&mut self) {
243        unsafe { htslib::bcf_hdr_destroy(self.inner) };
244    }
245}
246
/// A header record.
///
/// One entry per header line, as returned by `HeaderView::header_records`.
/// Every variant carries the record's `key`; all but `Generic` also carry the
/// record's key/value pairs in the order they were inserted.
#[derive(Debug)]
pub enum HeaderRecord {
    /// A `FILTER` header record.
    Filter {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// An `INFO` header record.
    Info {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// A `FORMAT` header record.
    Format {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// A `contig` header record.
    Contig {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// A structured header record.
    Structured {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// A generic, unstructured header record.
    Generic { key: String, value: String },
}
278
/// A view of a BCF/VCF header used for lookups (samples, contigs, tag types,
/// header records).
///
/// The view owns the raw header it wraps: cloning duplicates the header with
/// `bcf_hdr_dup`, and dropping the view destroys it with `bcf_hdr_destroy`.
#[derive(Debug)]
pub struct HeaderView {
    /// Raw pointer to the underlying htslib header owned by this view.
    pub(crate) inner: *mut htslib::bcf_hdr_t,
}

// NOTE(review): these impls assert that the raw htslib header may be moved to
// and shared between threads. Nothing in this file enforces that; confirm
// against htslib's thread-safety guarantees before relying on it.
unsafe impl Send for HeaderView {}
unsafe impl Sync for HeaderView {}
286
287impl HeaderView {
288    /// Create a view from a raw pointer to a header.
289    ///
290    /// # Safety
291    /// The caller must ensure that the header is initialized.
292    pub unsafe fn from_ptr(inner: *mut htslib::bcf_hdr_t) -> Self {
293        HeaderView { inner }
294    }
295
296    /// Get a pointer to the underlying raw header.
297    ///
298    /// # Safety
299    /// The caller must ensure that the pointer is not used after this
300    /// `HeaderView` is dropped
301    pub unsafe fn as_ptr(&self) -> *mut htslib::bcf_hdr_t {
302        self.inner
303    }
304
305    #[inline]
306    fn inner(&self) -> htslib::bcf_hdr_t {
307        unsafe { *self.inner }
308    }
309
310    /// Get the number of samples defined in the header.
311    pub fn sample_count(&self) -> u32 {
312        self.inner().n[htslib::BCF_DT_SAMPLE as usize] as u32
313    }
314
315    /// Get vector of sample names defined in the header.
316    pub fn samples(&self) -> Vec<&[u8]> {
317        let names =
318            unsafe { slice::from_raw_parts(self.inner().samples, self.sample_count() as usize) };
319        names
320            .iter()
321            .map(|name| unsafe { ffi::CStr::from_ptr(*name).to_bytes() })
322            .collect()
323    }
324
325    /// Obtain id (column index) of given sample.
326    /// Returns `None` if sample is not present in header.
327    pub fn sample_id(&self, sample: &[u8]) -> Option<usize> {
328        self.samples().iter().position(|s| *s == sample)
329    }
330
331    /// Get the number of contigs defined in the header.
332    pub fn contig_count(&self) -> u32 {
333        self.inner().n[htslib::BCF_DT_CTG as usize] as u32
334    }
335
336    pub fn rid2name(&self, rid: u32) -> Result<&[u8]> {
337        if rid <= self.contig_count() {
338            unsafe {
339                let dict = self.inner().id[htslib::BCF_DT_CTG as usize];
340                let ptr = (*dict.offset(rid as isize)).key;
341                Ok(ffi::CStr::from_ptr(ptr).to_bytes())
342            }
343        } else {
344            Err(Error::BcfUnknownRID { rid })
345        }
346    }
347
348    /// Retrieve the (internal) chromosome identifier
349    /// # Examples
350    /// ```rust
351    /// use rust_htslib::bcf::header::Header;
352    /// use rust_htslib::bcf::{Format, Writer};
353    ///
354    /// let mut header = Header::new();
355    /// let contig_field = br#"##contig=<ID=foo,length=10>"#;
356    /// header.push_record(contig_field);
357    /// let mut vcf = Writer::from_stdout(&header, true, Format::Vcf).unwrap();
358    /// let header_view = vcf.header();
359    /// let rid = header_view.name2rid(b"foo").unwrap();
360    /// assert_eq!(rid, 0);
361    /// // try and retrieve a contig not in the header
362    /// let result = header_view.name2rid(b"bar");
363    /// assert!(result.is_err())
364    /// ```
365    /// # Errors
366    /// If `name` does not match a chromosome currently in the VCF header, returns [`Error::BcfUnknownContig`]
367    pub fn name2rid(&self, name: &[u8]) -> Result<u32> {
368        let c_str = ffi::CString::new(name).unwrap();
369        unsafe {
370            match htslib::bcf_hdr_id2int(
371                self.inner,
372                htslib::BCF_DT_CTG as i32,
373                c_str.as_ptr() as *mut c_char,
374            ) {
375                -1 => Err(Error::BcfUnknownContig {
376                    contig: str::from_utf8(name).unwrap().to_owned(),
377                }),
378                i => Ok(i as u32),
379            }
380        }
381    }
382
383    pub fn info_type(&self, tag: &[u8]) -> Result<(TagType, TagLength)> {
384        self.tag_type(tag, htslib::BCF_HL_INFO)
385    }
386
387    pub fn format_type(&self, tag: &[u8]) -> Result<(TagType, TagLength)> {
388        self.tag_type(tag, htslib::BCF_HL_FMT)
389    }
390
391    fn tag_type(&self, tag: &[u8], hdr_type: ::libc::c_uint) -> Result<(TagType, TagLength)> {
392        let tag_desc = || str::from_utf8(tag).unwrap().to_owned();
393        let c_str_tag = ffi::CString::new(tag).unwrap();
394        let (_type, length, num_values) = unsafe {
395            let id = htslib::bcf_hdr_id2int(
396                self.inner,
397                htslib::BCF_DT_ID as i32,
398                c_str_tag.as_ptr() as *mut c_char,
399            );
400            if id < 0 {
401                return Err(Error::BcfUndefinedTag { tag: tag_desc() });
402            }
403            let n = (*self.inner).n[htslib::BCF_DT_ID as usize] as usize;
404            let entry = slice::from_raw_parts((*self.inner).id[htslib::BCF_DT_ID as usize], n);
405            let d = (*entry[id as usize].val).info[hdr_type as usize];
406            ((d >> 4) & 0xf, (d >> 8) & 0xf, d >> 12)
407        };
408        let _type = match _type as ::libc::c_uint {
409            htslib::BCF_HT_FLAG => TagType::Flag,
410            htslib::BCF_HT_INT => TagType::Integer,
411            htslib::BCF_HT_REAL => TagType::Float,
412            htslib::BCF_HT_STR => TagType::String,
413            _ => return Err(Error::BcfUnexpectedType { tag: tag_desc() }),
414        };
415        let length = match length as ::libc::c_uint {
416            // XXX: Hacky "as u32" cast. Trace back through unsafe{} towards BCF struct and rollback to proper type
417            htslib::BCF_VL_FIXED => TagLength::Fixed(num_values as u32),
418            htslib::BCF_VL_VAR => TagLength::Variable,
419            htslib::BCF_VL_A => TagLength::AltAlleles,
420            htslib::BCF_VL_R => TagLength::Alleles,
421            htslib::BCF_VL_G => TagLength::Genotypes,
422            _ => return Err(Error::BcfUnexpectedType { tag: tag_desc() }),
423        };
424
425        Ok((_type, length))
426    }
427
428    /// Convert string ID (e.g., for a `FILTER` value) to its numeric identifier.
429    pub fn name_to_id(&self, id: &[u8]) -> Result<Id> {
430        let c_str = ffi::CString::new(id).unwrap();
431        unsafe {
432            match htslib::bcf_hdr_id2int(
433                self.inner,
434                htslib::BCF_DT_ID as i32,
435                c_str.as_ptr() as *const c_char,
436            ) {
437                -1 => Err(Error::BcfUnknownID {
438                    id: str::from_utf8(id).unwrap().to_owned(),
439                }),
440                i => Ok(Id(i as u32)),
441            }
442        }
443    }
444
445    /// Convert integer representing an identifier (e.g., a `FILTER` value) to its string
446    /// name.bam.
447    pub fn id_to_name(&self, id: Id) -> Vec<u8> {
448        let key = unsafe {
449            ffi::CStr::from_ptr(
450                (*(*self.inner).id[htslib::BCF_DT_ID as usize].offset(*id as isize)).key,
451            )
452        };
453        key.to_bytes().to_vec()
454    }
455
456    /// Convert string sample name to its numeric identifier.
457    pub fn sample_to_id(&self, id: &[u8]) -> Result<Id> {
458        let c_str = ffi::CString::new(id).unwrap();
459        unsafe {
460            match htslib::bcf_hdr_id2int(
461                self.inner,
462                htslib::BCF_DT_SAMPLE as i32,
463                c_str.as_ptr() as *const c_char,
464            ) {
465                -1 => Err(Error::BcfUnknownSample {
466                    name: str::from_utf8(id).unwrap().to_owned(),
467                }),
468                i => Ok(Id(i as u32)),
469            }
470        }
471    }
472
473    /// Convert integer representing an contig to its name.
474    pub fn id_to_sample(&self, id: Id) -> Vec<u8> {
475        let key = unsafe {
476            ffi::CStr::from_ptr(
477                (*(*self.inner).id[htslib::BCF_DT_SAMPLE as usize].offset(*id as isize)).key,
478            )
479        };
480        key.to_bytes().to_vec()
481    }
482
483    /// Return structured `HeaderRecord`s.
484    pub fn header_records(&self) -> Vec<HeaderRecord> {
485        fn parse_kv(rec: &htslib::bcf_hrec_t) -> LinearMap<String, String> {
486            let mut result: LinearMap<String, String> = LinearMap::new();
487            for i in 0_i32..(rec.nkeys) {
488                let key = unsafe {
489                    ffi::CStr::from_ptr(*rec.keys.offset(i as isize))
490                        .to_str()
491                        .unwrap()
492                        .to_string()
493                };
494                let value = unsafe {
495                    ffi::CStr::from_ptr(*rec.vals.offset(i as isize))
496                        .to_str()
497                        .unwrap()
498                        .to_string()
499                };
500                result.insert(key, value);
501            }
502            result
503        }
504
505        let mut result: Vec<HeaderRecord> = Vec::new();
506        for i in 0_i32..unsafe { (*self.inner).nhrec } {
507            let rec = unsafe { &(**(*self.inner).hrec.offset(i as isize)) };
508            let key = unsafe { ffi::CStr::from_ptr(rec.key).to_str().unwrap().to_string() };
509            let record = match rec.type_ {
510                0 => HeaderRecord::Filter {
511                    key,
512                    values: parse_kv(rec),
513                },
514                1 => HeaderRecord::Info {
515                    key,
516                    values: parse_kv(rec),
517                },
518                2 => HeaderRecord::Format {
519                    key,
520                    values: parse_kv(rec),
521                },
522                3 => HeaderRecord::Contig {
523                    key,
524                    values: parse_kv(rec),
525                },
526                4 => HeaderRecord::Structured {
527                    key,
528                    values: parse_kv(rec),
529                },
530                5 => HeaderRecord::Generic {
531                    key,
532                    value: unsafe { ffi::CStr::from_ptr(rec.value).to_str().unwrap().to_string() },
533                },
534                _ => panic!("Unknown type: {}", rec.type_),
535            };
536            result.push(record);
537        }
538        result
539    }
540
541    /// Create an empty record using this header view.
542    ///
543    /// The record can be reused multiple times.
544    pub fn empty_record(self: &Arc<Self>) -> crate::bcf::Record {
545        crate::bcf::Record::new(self.clone())
546    }
547}
548
549impl Clone for HeaderView {
550    fn clone(&self) -> Self {
551        HeaderView {
552            inner: unsafe { htslib::bcf_hdr_dup(self.inner) },
553        }
554    }
555}
556
557impl Drop for HeaderView {
558    fn drop(&mut self) {
559        unsafe {
560            htslib::bcf_hdr_destroy(self.inner);
561        }
562    }
563}
564
/// Value type of an `INFO` or `FORMAT` tag, as reported by
/// `HeaderView::info_type` / `HeaderView::format_type`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum TagType {
    /// Corresponds to htslib's `BCF_HT_FLAG`.
    Flag,
    /// Corresponds to htslib's `BCF_HT_INT`.
    Integer,
    /// Corresponds to htslib's `BCF_HT_REAL`.
    Float,
    /// Corresponds to htslib's `BCF_HT_STR`.
    String,
}
572
/// Number of values of an `INFO` or `FORMAT` tag, as reported by
/// `HeaderView::info_type` / `HeaderView::format_type`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum TagLength {
    /// Corresponds to htslib's `BCF_VL_FIXED`; carries the value count decoded
    /// from the header.
    Fixed(u32),
    /// Corresponds to htslib's `BCF_VL_A`.
    AltAlleles,
    /// Corresponds to htslib's `BCF_VL_R`.
    Alleles,
    /// Corresponds to htslib's `BCF_VL_G`.
    Genotypes,
    /// Corresponds to htslib's `BCF_VL_VAR`.
    Variable,
}
581
#[cfg(test)]
mod tests {
    use super::*;
    use crate::bcf::Reader;
    use crate::htslib;

    /// A record created from a header view starts out with the expected
    /// contig id, position and quality.
    #[test]
    fn test_header_view_empty_record() {
        // Borrow the header view of a small VCF fixture.
        let reader = Reader::from_path("test/test_string.vcf").expect("Error opening file");
        let view = reader.header.clone();

        // Build a fresh record on top of that view.
        let rec = view.empty_record();
        eprintln!("{:?}", rec.rid());

        // Values reported by a freshly created record.
        assert_eq!(rec.rid(), Some(0));
        assert_eq!(rec.pos(), 0);
        assert_eq!(rec.qual(), 0.0);
    }

    /// A sample added through the raw pointer shows up in a `HeaderView`
    /// wrapping the same header.
    #[test]
    fn test_header_add_sample_via_raw_pointer() {
        let name = b"test-sample";

        let header = Header::new();
        let c_name = std::ffi::CString::new(name).unwrap();

        let view = unsafe {
            let raw = header.inner_ptr();
            // Ownership moves to the `HeaderView` below; forgetting the
            // `Header` prevents the header from being destroyed twice.
            std::mem::forget(header);
            htslib::bcf_hdr_add_sample(raw, c_name.as_ptr());
            htslib::bcf_hdr_sync(raw);
            // Dropping the view frees the underlying header.
            HeaderView::from_ptr(raw)
        };

        assert_eq!(view.samples(), vec![name]);
    }

    /// The raw pointer of a `HeaderView` can be passed to htslib directly.
    #[test]
    fn test_header_view_version_via_raw_pointer() {
        let reader = Reader::from_path("test/test_string.vcf").expect("Error opening file");
        let view = reader.header.clone();

        let version = unsafe {
            // `view` stays alive for as long as the pointer is used.
            let raw = view.as_ptr();
            let version_ptr = htslib::bcf_hdr_get_version(raw);
            std::ffi::CStr::from_ptr(version_ptr).to_str().unwrap()
        };

        assert_eq!(version, "VCFv4.1");
    }
}