Skip to main content

rsomics_vcf_index/
lib.rs

1use std::fs::File;
2use std::io::{self, BufRead};
3use std::path::Path;
4
5use noodles::bgzf;
6use noodles::core::Position;
7use noodles::csi::{
8    self as csi,
9    binning_index::index::{
10        ReferenceSequence,
11        header::{Builder as HeaderBuilder, ReferenceSequenceNames},
12        reference_sequence::bin::Chunk,
13    },
14    binning_index::{self, BinningIndex, index::reference_sequence::index::BinnedIndex},
15};
16use noodles::tabix;
17use noodles::vcf::{self, Header};
18
19/// Which on-disk format to write.
20#[derive(Debug, Clone, Copy, PartialEq, Eq)]
21pub enum IndexKind {
22    /// Coordinate-sorted index (.csi). The `bcftools index` default.
23    Csi,
24    /// Tabix index (.tbi). tabix -p vcf preset.
25    Tbi,
26}
27
28/// CSI binning parameters htslib (and thus `bcftools index`) writes by default:
29/// 14-bit minimum interval (16 kbp leaves) over 6 binning levels.
30const CSI_MIN_SHIFT: u8 = 14;
31const CSI_DEPTH: u8 = 6;
32
33/// Build and write an index for a bgzipped VCF.
34///
35/// `src` is the `.vcf.gz` input; `dst` is the path to write the index.  Caller
36/// is responsible for not-overwriting checks if `--force` is absent.
37pub fn index_vcf(src: &Path, dst: &Path, kind: IndexKind) -> io::Result<()> {
38    match kind {
39        IndexKind::Csi => {
40            let idx = build_csi(src)?;
41            csi::fs::write(dst, &idx)
42        }
43        IndexKind::Tbi => {
44            let idx = build_tbi(src)?;
45            tabix::fs::write(dst, &idx)
46        }
47    }
48}
49
50// CSI: BinnedIndex with htslib's default min_shift/depth (bcftools index default).
51fn build_csi(src: &Path) -> io::Result<csi::Index> {
52    let (header, mut reader) = open(src)?;
53
54    // Collect contig names in declaration order so the CSI header carries the name→id map.
55    // bcftools / htslib resolve a region string like "chr1:1-100000" by looking up the contig
56    // name in the CSI aux block's reference_sequence_names list.
57    let ref_names: ReferenceSequenceNames = header
58        .contigs()
59        .keys()
60        .map(|k| bstr::BString::from(k.as_str()))
61        .collect();
62
63    let csi_header = HeaderBuilder::vcf()
64        .set_reference_sequence_names(ref_names)
65        .build();
66
67    // BinnedIndex is the CSI-native index sub-type (vs LinearIndex for tabix).
68    let mut indexer =
69        binning_index::Indexer::<BinnedIndex>::new(CSI_MIN_SHIFT, CSI_DEPTH).set_header(csi_header);
70
71    let contig_count = header.contigs().len();
72
73    // Per-reference linear index, mirroring htslib's `lidx`/`l.offset[]` (hts.c
74    // `insert_to_l`). noodles' BinnedIndex stores each bin's loffset from the
75    // record's own bin only; htslib derives every bin's loffset from this 16 kbp
76    // linear index, so a spanning structural-variant record (e.g. `<DEL>` with a
77    // far `END=`) lowers the loffset of every window it overlaps. Without that
78    // propagation a region query overlapping the SV's span but not its start would
79    // prune the SV's chunk on the `chunk.end <= loffset` test htslib applies.
80    let mut linear_indices: Vec<LinearIndexBuilder> = (0..contig_count)
81        .map(|_| LinearIndexBuilder::default())
82        .collect();
83
84    let mut line = Vec::new();
85    let mut start_pos = reader.virtual_position();
86
87    while read_line(&mut reader, &mut line)? != 0 {
88        let end_pos = reader.virtual_position();
89        let chunk = Chunk::new(start_pos, end_pos);
90
91        let (ref_name, start, end) = parse_interval(&line, &header)?;
92        let ref_id = header.contigs().get_index_of(ref_name).ok_or_else(|| {
93            io::Error::new(
94                io::ErrorKind::InvalidData,
95                format!("contig '{ref_name}' not declared in VCF header"),
96            )
97        })?;
98
99        linear_indices[ref_id].insert(start, end, start_pos);
100        indexer.add_record(Some((ref_id, start, end, true)), chunk)?;
101        start_pos = end_pos;
102    }
103
104    let index = indexer.build(contig_count);
105    Ok(apply_linear_loffsets(&index, &mut linear_indices))
106}
107
108/// htslib's linear index for one reference: a 16 kbp-window array of the earliest
109/// (smallest) record start voffset reaching each window (hts.c `lidx_t`).
110#[derive(Default)]
111struct LinearIndexBuilder {
112    offsets: Vec<Option<bgzf::VirtualPosition>>,
113}
114
115impl LinearIndexBuilder {
116    /// Mirror of htslib `insert_to_l`: fill windows `beg>>min_shift ..=
117    /// (end-1)>>min_shift` with `offset`, but only where still unset. Records
118    /// arrive in start order, so the first record to reach a window carries the
119    /// minimum start voffset of any record overlapping it.
120    fn insert(&mut self, start: Position, end: Position, offset: bgzf::VirtualPosition) {
121        let shift = usize::from(CSI_MIN_SHIFT);
122        let beg = (usize::from(start) - 1) >> shift;
123        let end = (usize::from(end) - 1) >> shift;
124
125        if self.offsets.len() < end + 1 {
126            self.offsets.resize(end + 1, None);
127        }
128
129        for window in &mut self.offsets[beg..=end] {
130            window.get_or_insert(offset);
131        }
132    }
133
134    /// Mirror of the backfill loop in htslib `update_loff`: walk right→left and
135    /// give every still-unset window the value of its right neighbour, so each
136    /// window holds the smallest start voffset of any record at-or-after it.
137    fn finish(&mut self) {
138        for i in (0..self.offsets.len().saturating_sub(1)).rev() {
139            if self.offsets[i].is_none() {
140                self.offsets[i] = self.offsets[i + 1];
141            }
142        }
143    }
144
145    /// The loffset htslib assigns a bin: `lidx[hts_bin_bot(bin)]`, i.e. the linear
146    /// index entry for the bin's lowest-coordinate covered window. Out-of-range
147    /// (no record reached that window) yields the default (0), matching htslib's
148    /// `bot_bin < lidx->n ? lidx->offset[bot_bin] : 0`.
149    fn loffset(&self, bin_id: usize) -> bgzf::VirtualPosition {
150        let bot = hts_bin_bot(bin_id, CSI_DEPTH);
151        self.offsets.get(bot).copied().flatten().unwrap_or_default()
152    }
153}
154
155/// First bin id on level `l` of the binning tree (hts.h `hts_bin_first`).
156fn hts_bin_first(level: u32) -> usize {
157    ((1usize << (3 * level)) - 1) / 7
158}
159
160/// Level of a bin in the binning tree (hts.h `hts_bin_level`): parent walks to 0.
161fn hts_bin_level(mut bin: usize) -> u32 {
162    let mut level = 0;
163    while bin != 0 {
164        bin = (bin - 1) >> 3;
165        level += 1;
166    }
167    level
168}
169
170/// Lowest-coordinate linear-index window covered by `bin` (hts.h `hts_bin_bot`):
171/// `(bin - hts_bin_first(level)) << ((n_lvls - level) * 3)`.
172fn hts_bin_bot(bin: usize, depth: u8) -> usize {
173    let level = hts_bin_level(bin);
174    let n_lvls = u32::from(depth);
175    (bin - hts_bin_first(level)) << ((n_lvls - level) * 3)
176}
177
178/// Replace every bin's loffset with htslib's linear-index-derived value
179/// (`update_loff` in hts.c), closing the spanning-SV gap in noodles' per-bin
180/// loffset. The bin chunk lists, metadata, and binning parameters built by the
181/// indexer are preserved; only the `BinnedIndex` loffset map of each reference is
182/// rebuilt. O(bins) over an already-collected linear index — no hot-path cost.
183fn apply_linear_loffsets(
184    index: &csi::Index,
185    linear_indices: &mut [LinearIndexBuilder],
186) -> csi::Index {
187    let min_shift = index.min_shift();
188    let depth = index.depth();
189    let header = index.header().cloned();
190    let unplaced = index.unplaced_unmapped_record_count();
191
192    let reference_sequences: Vec<ReferenceSequence<BinnedIndex>> = index
193        .reference_sequences()
194        .iter()
195        .zip(linear_indices.iter_mut())
196        .map(|(reference_sequence, linear_index)| {
197            linear_index.finish();
198
199            let bins = reference_sequence.bins().clone();
200            let loffsets: BinnedIndex = bins
201                .keys()
202                .map(|&bin_id| (bin_id, linear_index.loffset(bin_id)))
203                .collect();
204            let metadata = csi_metadata(reference_sequence).cloned();
205
206            ReferenceSequence::new(bins, loffsets, metadata)
207        })
208        .collect();
209
210    let mut builder = csi::Index::builder()
211        .set_min_shift(min_shift)
212        .set_depth(depth)
213        .set_reference_sequences(reference_sequences);
214
215    if let Some(header) = header {
216        builder = builder.set_header(header);
217    }
218    if let Some(count) = unplaced {
219        builder = builder.set_unplaced_unmapped_record_count(count);
220    }
221
222    builder.build()
223}
224
225/// Read a reference sequence's metadata pseudo-bin through the trait the CSI
226/// reference-sequence type implements.
227fn csi_metadata(
228    reference_sequence: &ReferenceSequence<BinnedIndex>,
229) -> Option<&csi::binning_index::index::reference_sequence::Metadata> {
230    use csi::binning_index::ReferenceSequence as _;
231    reference_sequence.metadata()
232}
233
234// TBI: LinearIndex, tabix VCF preset.
235fn build_tbi(src: &Path) -> io::Result<tabix::Index> {
236    let (header, mut reader) = open(src)?;
237
238    let mut indexer = tabix::index::Indexer::default();
239    indexer.set_header(HeaderBuilder::vcf().build());
240
241    let mut line = Vec::new();
242    let mut start_pos = reader.virtual_position();
243
244    while read_line(&mut reader, &mut line)? != 0 {
245        let end_pos = reader.virtual_position();
246        let chunk = Chunk::new(start_pos, end_pos);
247
248        let (ref_name, start, end) = parse_interval(&line, &header)?;
249        indexer.add_record(ref_name, start, end, chunk)?;
250        start_pos = end_pos;
251    }
252
253    Ok(indexer.build())
254}
255
256/// Read the header through noodles (for the contig map and file format), then
257/// hand back the underlying BGZF reader positioned at the first data record so
258/// the index loop can do a minimal CHROM/POS/REF parse off the raw lines.
259fn open(src: &Path) -> io::Result<(Header, bgzf::io::Reader<File>)> {
260    let mut reader = File::open(src)
261        .map(bgzf::io::Reader::new)
262        .map(vcf::io::Reader::new)?;
263    let header = reader.read_header()?;
264    Ok((header, reader.into_inner()))
265}
266
267/// Read one record line (without the trailing newline) into `dst`.  Mirrors the
268/// std `read_until` semantics noodles itself relies on, so `virtual_position()`
269/// after the call lands on the byte after the line feed — the chunk boundary.
270fn read_line<R>(reader: &mut R, dst: &mut Vec<u8>) -> io::Result<usize>
271where
272    R: BufRead,
273{
274    const LINE_FEED: u8 = b'\n';
275    const CARRIAGE_RETURN: u8 = b'\r';
276
277    dst.clear();
278    match reader.read_until(LINE_FEED, dst)? {
279        0 => Ok(0),
280        n => {
281            if dst.last() == Some(&LINE_FEED) {
282                dst.pop();
283                if dst.last() == Some(&CARRIAGE_RETURN) {
284                    dst.pop();
285                }
286            }
287            Ok(n)
288        }
289    }
290}
291
292/// Extract the CHROM, start, and end of a record's index interval from a raw
293/// VCF data line.
294///
295/// htslib indexes each record over the 1-based inclusive span from `POS` to
296/// `POS + rlen - 1`, where `rlen` is the REF allele length for the common
297/// SNV/indel case.  For VCF < 4.5 an INFO `END` value, when larger, extends the
298/// span (the reach htslib uses for symbolic structural-variant records).
299/// Parsing CHROM, POS, REF, and an `END=` scan of INFO avoids materialising the
300/// alternate alleles, FORMAT keys, and per-sample genotypes a full record decode
301/// would touch — none of which affect the index interval.
302fn parse_interval<'a>(
303    line: &'a [u8],
304    header: &Header,
305) -> io::Result<(&'a str, Position, Position)> {
306    let chrom_end = memchr::memchr(b'\t', line).ok_or_else(|| invalid("VCF record missing POS"))?;
307    let ref_name = std::str::from_utf8(&line[..chrom_end])
308        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
309
310    let after_chrom = &line[chrom_end + 1..];
311    let pos_end =
312        memchr::memchr(b'\t', after_chrom).ok_or_else(|| invalid("VCF record missing ID"))?;
313
314    // POS == 0 is the telomere-start sentinel htslib treats specially; noodles'
315    // own indexer rejects records with no valid 1-based start, so we do too.
316    let pos = parse_usize(&after_chrom[..pos_end])?;
317    let start =
318        Position::try_from(pos).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
319
320    // Skip ID, land on REF.
321    let after_pos = &after_chrom[pos_end + 1..];
322    let id_end =
323        memchr::memchr(b'\t', after_pos).ok_or_else(|| invalid("VCF record missing REF"))?;
324    let after_id = &after_pos[id_end + 1..];
325    let ref_len =
326        memchr::memchr(b'\t', after_id).ok_or_else(|| invalid("VCF record missing ALT"))?;
327    if ref_len == 0 {
328        return Err(invalid("invalid reference bases length"));
329    }
330
331    // span end = POS + rlen - 1; END (VCF < 4.5) extends it when larger.
332    let mut end = start
333        .checked_add(ref_len - 1)
334        .ok_or_else(|| invalid("position overflow"))?;
335    if let Some(end_pos) = info_end(after_id, header)? {
336        let end_position = Position::try_from(end_pos)
337            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
338        end = end.max(end_position);
339    }
340
341    Ok((ref_name, start, end))
342}
343
344/// For VCF < 4.5, htslib derives the record's reach from the INFO `END` field
345/// when present.  Returns the parsed 1-based inclusive end coordinate, or `None`
346/// when END is absent or the file format is >= 4.5 (where END is deprecated and
347/// the reach comes from REF/SVLEN instead).  Scans only the INFO column for an
348/// `END=` key — no full INFO parse.
349fn info_end(after_id: &[u8], header: &Header) -> io::Result<Option<usize>> {
350    let ff = header.file_format();
351    if (ff.major(), ff.minor()) >= (4, 5) {
352        return Ok(None);
353    }
354
355    // after_id starts at REF; INFO is the 4th tab field beyond it
356    // (REF, ALT, QUAL, FILTER, INFO).
357    let Some(info) = nth_tab_field(after_id, 4) else {
358        return Ok(None);
359    };
360    if info == b"." {
361        return Ok(None);
362    }
363
364    match find_info_value(info, b"END") {
365        Some(value) => Ok(Some(parse_usize(value)?)),
366        None => Ok(None),
367    }
368}
369
370/// Find the value of `key` inside a VCF INFO field (`;`-delimited `KEY=VALUE`
371/// pairs).  Returns the raw value bytes if the key is found.
372fn find_info_value<'a>(info: &'a [u8], key: &[u8]) -> Option<&'a [u8]> {
373    for entry in info.split(|&b| b == b';') {
374        if let Some(eq) = memchr::memchr(b'=', entry)
375            && &entry[..eq] == key
376        {
377            return Some(&entry[eq + 1..]);
378        }
379    }
380    None
381}
382
383/// Return the `n`-th tab-delimited field of `src` beyond the current position
384/// (0-based), stopping at the next tab or end of slice.
385fn nth_tab_field(src: &[u8], n: usize) -> Option<&[u8]> {
386    let mut rest = src;
387    for _ in 0..n {
388        let i = memchr::memchr(b'\t', rest)?;
389        rest = &rest[i + 1..];
390    }
391    let end = memchr::memchr(b'\t', rest).unwrap_or(rest.len());
392    Some(&rest[..end])
393}
394
395fn parse_usize(bytes: &[u8]) -> io::Result<usize> {
396    let s =
397        std::str::from_utf8(bytes).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
398    s.parse::<usize>()
399        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
400}
401
402fn invalid(msg: &'static str) -> io::Error {
403    io::Error::new(io::ErrorKind::InvalidData, msg)
404}