rust_htslib/bcf/
header.rs

1// Copyright 2014 Johannes Köster.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5//! Module for working with VCF or BCF headers.
6//!
7//! # Examples
8//! From the header of a VCF file we can
9//!   - Output sample count of a VCF file
10//!   - Output sample names of a VCF file
11//!   - Output sample index given a sample name of a VCF file.
12//! ```
13//! use crate::rust_htslib::bcf::{Reader, Read};
14//! use std::io::Read as IoRead;
15//!
16//! let path = &"test/test_string.vcf";
17//! let mut bcf = Reader::from_path(path).expect("Error opening file.");
18//! let header = bcf.header();
19//! assert_eq!(header.sample_count(), 2);  // Sample count
20//! let mut s = String::new();
21//! for (i, mut x) in header.samples().into_iter().enumerate() {
//!     x.read_to_string(&mut s);  // Read the sample name into `s`
23//!     println!("{}", s);  // output sample name
24//! }
25//! assert_eq!(header.sample_id(b"one").unwrap(), 0);  // Sample index wrapped in Option<usize>
26//! assert_eq!(header.sample_id(b"two").unwrap(), 1);  // Sample index wrapped in Option<usize>
27//! assert!(header.sample_id(b"non existent sample").is_none());  // Return none if not found
28//!
//! assert_eq!(header.contig_count(), 1); // Number of contigs in the header.
30//! // obtain the data type of an INFO field
31//! let (tag_type, tag_length) = header.info_type(b"S1").unwrap();
32//! let (fmt_type, fmt_length) = header.format_type(b"GT").unwrap();
33//! ```
34
35use std::ffi;
36use std::os::raw::c_char;
37use std::rc::Rc;
38use std::slice;
39use std::str;
40
41use crate::htslib;
42
43use linear_map::LinearMap;
44
45use crate::errors::{Error, Result};
46
/// Sample index map stored in `Header::subset`.
///
/// In this file it is only produced by `Header::from_template_subset`, which allocates one
/// `i32` per requested sample name and lets htslib's `bcf_hdr_subset` fill the values in.
// NOTE(review): presumably each entry is the index of the requested sample in the template
// header — confirm against the htslib `bcf_hdr_subset` documentation.
pub type SampleSubset = Vec<i32>;
48
custom_derive! {
    /// A newtype for IDs from BCF headers.
    ///
    /// Wraps the raw `u32` index that `HeaderView::name_to_id` and
    /// `HeaderView::sample_to_id` return and that `HeaderView::id_to_name` and
    /// `HeaderView::id_to_sample` accept.
    #[derive(
        NewtypeFrom,
        NewtypeDeref,
        PartialEq,
        PartialOrd,
        Eq,
        Ord,
        Copy,
        Clone,
        Debug
    )]
    pub struct Id(pub u32);
}
64
/// A BCF header.
///
/// Wraps a raw htslib header pointer. The `Drop` implementation in this file calls
/// `bcf_hdr_destroy` on `inner`.
#[derive(Debug)]
pub struct Header {
    /// Raw pointer to the htslib header structure.
    pub inner: *mut htslib::bcf_hdr_t,
    /// Sample index map. Set to `Some` by `from_template_subset`; the other constructors
    /// in this file leave it as `None`.
    pub subset: Option<SampleSubset>,
}
71
72impl Default for Header {
73    fn default() -> Self {
74        Self::new()
75    }
76}
77
78impl Header {
79    /// Create a new (empty) `Header`.
80    pub fn new() -> Self {
81        let c_str = ffi::CString::new(&b"w"[..]).unwrap();
82        Header {
83            inner: unsafe { htslib::bcf_hdr_init(c_str.as_ptr()) },
84            subset: None,
85        }
86    }
87
88    /// Create a new `Header` using the given `HeaderView` as the template.
89    ///
90    /// After construction, you can modify the header independently from the template `header`.
91    ///
92    /// # Arguments
93    ///
94    /// - `header` - The `HeaderView` to use as the template.
95    pub fn from_template(header: &HeaderView) -> Self {
96        Header {
97            inner: unsafe { htslib::bcf_hdr_dup(header.inner) },
98            subset: None,
99        }
100    }
101
102    /// Create a new `Header` using the given `HeaderView` as as template, but subsetting to the
103    /// given `samples`.
104    ///
105    /// # Arguments
106    ///
107    /// - `header` - The `HeaderView` to use for the template.
108    /// - `samples` - A slice of byte-encoded (`[u8]`) sample names.
109    pub fn from_template_subset(header: &HeaderView, samples: &[&[u8]]) -> Result<Self> {
110        let mut imap = vec![0; samples.len()];
111        let names: Vec<_> = samples
112            .iter()
113            .map(|&s| ffi::CString::new(s).unwrap())
114            .collect();
115        let name_pointers: Vec<_> = names.iter().map(|s| s.as_ptr() as *mut i8).collect();
116        #[allow(clippy::unnecessary_cast)]
117        let name_pointers_ptr = name_pointers.as_ptr() as *const *mut c_char;
118        let inner = unsafe {
119            htslib::bcf_hdr_subset(
120                header.inner,
121                samples.len() as i32,
122                name_pointers_ptr,
123                imap.as_mut_ptr(),
124            )
125        };
126        if inner.is_null() {
127            Err(Error::BcfDuplicateSampleNames)
128        } else {
129            Ok(Header {
130                inner,
131                subset: Some(imap),
132            })
133        }
134    }
135
136    /// Add a `sample` to the header.
137    ///
138    /// # Arguments
139    ///
140    /// - `sample` - Name of the sample to add (to the end of the sample list).
141    pub fn push_sample(&mut self, sample: &[u8]) -> &mut Self {
142        let c_str = ffi::CString::new(sample).unwrap();
143        unsafe { htslib::bcf_hdr_add_sample(self.inner, c_str.as_ptr()) };
144        self
145    }
146
147    /// Add a record to the header.
148    ///
149    /// # Arguments
150    ///
151    /// - `record` - String representation of the header line
152    ///
153    /// # Example
154    ///
155    /// ```rust,ignore
156    /// header.push_record(format!("##contig=<ID={},length={}>", "chrX", 155270560).as_bytes());
157    /// ```
158    pub fn push_record(&mut self, record: &[u8]) -> &mut Self {
159        let c_str = ffi::CString::new(record).unwrap();
160        unsafe { htslib::bcf_hdr_append(self.inner, c_str.as_ptr()) };
161        self
162    }
163
164    /// Remove a `FILTER` entry from the header.
165    ///
166    /// # Arguments
167    ///
168    /// - `tag` - Name of the `FLT` tag to remove.
169    pub fn remove_filter(&mut self, tag: &[u8]) -> &mut Self {
170        self.remove_impl(tag, htslib::BCF_HL_FLT)
171    }
172
173    /// Remove an `INFO` entry from the header.
174    ///
175    /// # Arguments
176    ///
177    /// - `tag` - Name of the `INFO` tag to remove.
178    pub fn remove_info(&mut self, tag: &[u8]) -> &mut Self {
179        self.remove_impl(tag, htslib::BCF_HL_INFO)
180    }
181
182    /// Remove a `FORMAT` entry from the header.
183    ///
184    /// # Arguments
185    ///
186    /// - `tag` - Name of the `FORMAT` tag to remove.
187    pub fn remove_format(&mut self, tag: &[u8]) -> &mut Self {
188        self.remove_impl(tag, htslib::BCF_HL_FMT)
189    }
190
191    /// Remove a contig entry from the header.
192    ///
193    /// # Arguments
194    ///
195    /// - `tag` - Name of the `FORMAT` tag to remove.
196    pub fn remove_contig(&mut self, tag: &[u8]) -> &mut Self {
197        self.remove_impl(tag, htslib::BCF_HL_CTG)
198    }
199
200    /// Remove a structured entry from the header.
201    ///
202    /// # Arguments
203    ///
204    /// - `tag` - Name of the structured tag to remove.
205    pub fn remove_structured(&mut self, tag: &[u8]) -> &mut Self {
206        self.remove_impl(tag, htslib::BCF_HL_STR)
207    }
208
209    /// Remove a generic entry from the header.
210    ///
211    /// # Arguments
212    ///
213    /// - `tag` - Name of the generic tag to remove.
214    pub fn remove_generic(&mut self, tag: &[u8]) -> &mut Self {
215        self.remove_impl(tag, htslib::BCF_HL_GEN)
216    }
217
218    /// Implementation of removing header tags.
219    fn remove_impl(&mut self, tag: &[u8], type_: u32) -> &mut Self {
220        unsafe {
221            let v = tag.to_vec();
222            let c_str = ffi::CString::new(v).unwrap();
223            htslib::bcf_hdr_remove(self.inner, type_ as i32, c_str.as_ptr());
224        }
225        self
226    }
227}
228
229impl Drop for Header {
230    fn drop(&mut self) {
231        unsafe { htslib::bcf_hdr_destroy(self.inner) };
232    }
233}
234
/// A header record.
///
/// Values of this type are built by `HeaderView::header_records`, one per header line
/// stored in the underlying htslib header. In every variant `key` is the record's own key
/// string. The structured variants carry the record's key/value pairs in `values`, while
/// `Generic` carries a single `value` string.
#[derive(Debug)]
pub enum HeaderRecord {
    /// A `FILTER` header record.
    Filter {
        key: String,
        values: LinearMap<String, String>,
    },
    /// An `INFO` header record.
    Info {
        key: String,
        values: LinearMap<String, String>,
    },
    /// A `FORMAT` header record.
    Format {
        key: String,
        values: LinearMap<String, String>,
    },
    /// A `contig` header record.
    Contig {
        key: String,
        values: LinearMap<String, String>,
    },
    /// A structured header record.
    Structured {
        key: String,
        values: LinearMap<String, String>,
    },
    /// A generic, unstructured header record.
    Generic { key: String, value: String },
}
266
/// A view of a BCF header, used to query samples, contigs, tags and header records.
///
/// Wraps a raw htslib header pointer. In this file, `Clone` duplicates the header with
/// `bcf_hdr_dup` and `Drop` calls `bcf_hdr_destroy` on `inner`.
#[derive(Debug)]
pub struct HeaderView {
    /// Raw pointer to the htslib header structure.
    pub inner: *mut htslib::bcf_hdr_t,
}
271
272impl HeaderView {
273    pub fn new(inner: *mut htslib::bcf_hdr_t) -> Self {
274        HeaderView { inner }
275    }
276
277    #[inline]
278    fn inner(&self) -> htslib::bcf_hdr_t {
279        unsafe { *self.inner }
280    }
281
282    /// Get the number of samples defined in the header.
283    pub fn sample_count(&self) -> u32 {
284        self.inner().n[htslib::BCF_DT_SAMPLE as usize] as u32
285    }
286
287    /// Get vector of sample names defined in the header.
288    pub fn samples(&self) -> Vec<&[u8]> {
289        let names =
290            unsafe { slice::from_raw_parts(self.inner().samples, self.sample_count() as usize) };
291        names
292            .iter()
293            .map(|name| unsafe { ffi::CStr::from_ptr(*name).to_bytes() })
294            .collect()
295    }
296
297    /// Obtain id (column index) of given sample.
298    /// Returns `None` if sample is not present in header.
299    pub fn sample_id(&self, sample: &[u8]) -> Option<usize> {
300        self.samples().iter().position(|s| *s == sample)
301    }
302
303    /// Get the number of contigs defined in the header.
304    pub fn contig_count(&self) -> u32 {
305        self.inner().n[htslib::BCF_DT_CTG as usize] as u32
306    }
307
308    pub fn rid2name(&self, rid: u32) -> Result<&[u8]> {
309        if rid <= self.contig_count() {
310            unsafe {
311                let dict = self.inner().id[htslib::BCF_DT_CTG as usize];
312                let ptr = (*dict.offset(rid as isize)).key;
313                Ok(ffi::CStr::from_ptr(ptr).to_bytes())
314            }
315        } else {
316            Err(Error::BcfUnknownRID { rid })
317        }
318    }
319
320    /// Retrieve the (internal) chromosome identifier
321    /// # Examples
322    /// ```rust
323    /// use rust_htslib::bcf::header::Header;
324    /// use rust_htslib::bcf::{Format, Writer};
325    ///
326    /// let mut header = Header::new();
327    /// let contig_field = br#"##contig=<ID=foo,length=10>"#;
328    /// header.push_record(contig_field);
329    /// let mut vcf = Writer::from_stdout(&header, true, Format::Vcf).unwrap();
330    /// let header_view = vcf.header();
331    /// let rid = header_view.name2rid(b"foo").unwrap();
332    /// assert_eq!(rid, 0);
333    /// // try and retrieve a contig not in the header
334    /// let result = header_view.name2rid(b"bar");
335    /// assert!(result.is_err())
336    /// ```
337    /// # Errors
338    /// If `name` does not match a chromosome currently in the VCF header, returns [`Error::BcfUnknownContig`]
339    pub fn name2rid(&self, name: &[u8]) -> Result<u32> {
340        let c_str = ffi::CString::new(name).unwrap();
341        unsafe {
342            match htslib::bcf_hdr_id2int(
343                self.inner,
344                htslib::BCF_DT_CTG as i32,
345                c_str.as_ptr() as *mut c_char,
346            ) {
347                -1 => Err(Error::BcfUnknownContig {
348                    contig: str::from_utf8(name).unwrap().to_owned(),
349                }),
350                i => Ok(i as u32),
351            }
352        }
353    }
354
355    pub fn info_type(&self, tag: &[u8]) -> Result<(TagType, TagLength)> {
356        self.tag_type(tag, htslib::BCF_HL_INFO)
357    }
358
359    pub fn format_type(&self, tag: &[u8]) -> Result<(TagType, TagLength)> {
360        self.tag_type(tag, htslib::BCF_HL_FMT)
361    }
362
363    fn tag_type(&self, tag: &[u8], hdr_type: ::libc::c_uint) -> Result<(TagType, TagLength)> {
364        let tag_desc = || str::from_utf8(tag).unwrap().to_owned();
365        let c_str_tag = ffi::CString::new(tag).unwrap();
366        let (_type, length, num_values) = unsafe {
367            let id = htslib::bcf_hdr_id2int(
368                self.inner,
369                htslib::BCF_DT_ID as i32,
370                c_str_tag.as_ptr() as *mut c_char,
371            );
372            if id < 0 {
373                return Err(Error::BcfUndefinedTag { tag: tag_desc() });
374            }
375            let n = (*self.inner).n[htslib::BCF_DT_ID as usize] as usize;
376            let entry = slice::from_raw_parts((*self.inner).id[htslib::BCF_DT_ID as usize], n);
377            let d = (*entry[id as usize].val).info[hdr_type as usize];
378            ((d >> 4) & 0xf, (d >> 8) & 0xf, d >> 12)
379        };
380        let _type = match _type as ::libc::c_uint {
381            htslib::BCF_HT_FLAG => TagType::Flag,
382            htslib::BCF_HT_INT => TagType::Integer,
383            htslib::BCF_HT_REAL => TagType::Float,
384            htslib::BCF_HT_STR => TagType::String,
385            _ => return Err(Error::BcfUnexpectedType { tag: tag_desc() }),
386        };
387        let length = match length as ::libc::c_uint {
388            // XXX: Hacky "as u32" cast. Trace back through unsafe{} towards BCF struct and rollback to proper type
389            htslib::BCF_VL_FIXED => TagLength::Fixed(num_values as u32),
390            htslib::BCF_VL_VAR => TagLength::Variable,
391            htslib::BCF_VL_A => TagLength::AltAlleles,
392            htslib::BCF_VL_R => TagLength::Alleles,
393            htslib::BCF_VL_G => TagLength::Genotypes,
394            _ => return Err(Error::BcfUnexpectedType { tag: tag_desc() }),
395        };
396
397        Ok((_type, length))
398    }
399
400    /// Convert string ID (e.g., for a `FILTER` value) to its numeric identifier.
401    pub fn name_to_id(&self, id: &[u8]) -> Result<Id> {
402        let c_str = ffi::CString::new(id).unwrap();
403        unsafe {
404            match htslib::bcf_hdr_id2int(
405                self.inner,
406                htslib::BCF_DT_ID as i32,
407                c_str.as_ptr() as *const c_char,
408            ) {
409                -1 => Err(Error::BcfUnknownID {
410                    id: str::from_utf8(id).unwrap().to_owned(),
411                }),
412                i => Ok(Id(i as u32)),
413            }
414        }
415    }
416
417    /// Convert integer representing an identifier (e.g., a `FILTER` value) to its string
418    /// name.bam.
419    pub fn id_to_name(&self, id: Id) -> Vec<u8> {
420        let key = unsafe {
421            ffi::CStr::from_ptr(
422                (*(*self.inner).id[htslib::BCF_DT_ID as usize].offset(*id as isize)).key,
423            )
424        };
425        key.to_bytes().to_vec()
426    }
427
428    /// Convert string sample name to its numeric identifier.
429    pub fn sample_to_id(&self, id: &[u8]) -> Result<Id> {
430        let c_str = ffi::CString::new(id).unwrap();
431        unsafe {
432            match htslib::bcf_hdr_id2int(
433                self.inner,
434                htslib::BCF_DT_SAMPLE as i32,
435                c_str.as_ptr() as *const c_char,
436            ) {
437                -1 => Err(Error::BcfUnknownSample {
438                    name: str::from_utf8(id).unwrap().to_owned(),
439                }),
440                i => Ok(Id(i as u32)),
441            }
442        }
443    }
444
445    /// Convert integer representing an contig to its name.
446    pub fn id_to_sample(&self, id: Id) -> Vec<u8> {
447        let key = unsafe {
448            ffi::CStr::from_ptr(
449                (*(*self.inner).id[htslib::BCF_DT_SAMPLE as usize].offset(*id as isize)).key,
450            )
451        };
452        key.to_bytes().to_vec()
453    }
454
455    /// Return structured `HeaderRecord`s.
456    pub fn header_records(&self) -> Vec<HeaderRecord> {
457        fn parse_kv(rec: &htslib::bcf_hrec_t) -> LinearMap<String, String> {
458            let mut result: LinearMap<String, String> = LinearMap::new();
459            for i in 0_i32..(rec.nkeys) {
460                let key = unsafe {
461                    ffi::CStr::from_ptr(*rec.keys.offset(i as isize))
462                        .to_str()
463                        .unwrap()
464                        .to_string()
465                };
466                let value = unsafe {
467                    ffi::CStr::from_ptr(*rec.vals.offset(i as isize))
468                        .to_str()
469                        .unwrap()
470                        .to_string()
471                };
472                result.insert(key, value);
473            }
474            result
475        }
476
477        let mut result: Vec<HeaderRecord> = Vec::new();
478        for i in 0_i32..unsafe { (*self.inner).nhrec } {
479            let rec = unsafe { &(**(*self.inner).hrec.offset(i as isize)) };
480            let key = unsafe { ffi::CStr::from_ptr(rec.key).to_str().unwrap().to_string() };
481            let record = match rec.type_ {
482                0 => HeaderRecord::Filter {
483                    key,
484                    values: parse_kv(rec),
485                },
486                1 => HeaderRecord::Info {
487                    key,
488                    values: parse_kv(rec),
489                },
490                2 => HeaderRecord::Format {
491                    key,
492                    values: parse_kv(rec),
493                },
494                3 => HeaderRecord::Contig {
495                    key,
496                    values: parse_kv(rec),
497                },
498                4 => HeaderRecord::Structured {
499                    key,
500                    values: parse_kv(rec),
501                },
502                5 => HeaderRecord::Generic {
503                    key,
504                    value: unsafe { ffi::CStr::from_ptr(rec.value).to_str().unwrap().to_string() },
505                },
506                _ => panic!("Unknown type: {}", rec.type_),
507            };
508            result.push(record);
509        }
510        result
511    }
512
513    /// Create an empty record using this header view.
514    ///
515    /// The record can be reused multiple times.
516    pub fn empty_record(&self) -> crate::bcf::Record {
517        crate::bcf::Record::new(Rc::new(self.clone()))
518    }
519}
520
521impl Clone for HeaderView {
522    fn clone(&self) -> Self {
523        HeaderView {
524            inner: unsafe { htslib::bcf_hdr_dup(self.inner) },
525        }
526    }
527}
528
529impl Drop for HeaderView {
530    fn drop(&mut self) {
531        unsafe {
532            htslib::bcf_hdr_destroy(self.inner);
533        }
534    }
535}
536
/// Value type of an `INFO` or `FORMAT` tag, as returned by `HeaderView::info_type` and
/// `HeaderView::format_type`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum TagType {
    /// Decoded from htslib's `BCF_HT_FLAG`.
    Flag,
    /// Decoded from htslib's `BCF_HT_INT`.
    Integer,
    /// Decoded from htslib's `BCF_HT_REAL`.
    Float,
    /// Decoded from htslib's `BCF_HT_STR`.
    String,
}
544
/// Length class of an `INFO` or `FORMAT` tag, as returned by `HeaderView::info_type` and
/// `HeaderView::format_type`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum TagLength {
    /// Decoded from htslib's `BCF_VL_FIXED`; carries the number of values from the header.
    Fixed(u32),
    /// Decoded from htslib's `BCF_VL_A`.
    AltAlleles,
    /// Decoded from htslib's `BCF_VL_R`.
    Alleles,
    /// Decoded from htslib's `BCF_VL_G`.
    Genotypes,
    /// Decoded from htslib's `BCF_VL_VAR`.
    Variable,
}
553
#[cfg(test)]
mod tests {
    use crate::bcf::Reader;

    /// A record created from a header view starts out with default field values.
    #[test]
    fn test_header_view_empty_record() {
        // Open a VCF file to get a HeaderView
        let vcf = Reader::from_path("test/test_string.vcf").expect("Error opening file");
        let header_view = vcf.header.clone();

        // Create an empty record from the HeaderView
        let record = header_view.empty_record();

        // A fresh record reports its initial values: rid 0 (not `None`), position 0 and
        // quality 0.0.
        assert_eq!(record.rid(), Some(0));
        assert_eq!(record.pos(), 0);
        assert_eq!(record.qual(), 0.0);
    }
}
573}