1use std::fs::File;
2use std::io::{self, BufRead};
3use std::path::Path;
4
5use noodles::bgzf;
6use noodles::core::Position;
7use noodles::csi::{
8 self as csi,
9 binning_index::index::{
10 ReferenceSequence,
11 header::{Builder as HeaderBuilder, ReferenceSequenceNames},
12 reference_sequence::bin::Chunk,
13 },
14 binning_index::{self, BinningIndex, index::reference_sequence::index::BinnedIndex},
15};
16use noodles::tabix;
17use noodles::vcf::{self, Header};
18
19#[derive(Debug, Clone, Copy, PartialEq, Eq)]
21pub enum IndexKind {
22 Csi,
24 Tbi,
26}
27
28const CSI_MIN_SHIFT: u8 = 14;
31const CSI_DEPTH: u8 = 6;
32
33pub fn index_vcf(src: &Path, dst: &Path, kind: IndexKind) -> io::Result<()> {
38 match kind {
39 IndexKind::Csi => {
40 let idx = build_csi(src)?;
41 csi::fs::write(dst, &idx)
42 }
43 IndexKind::Tbi => {
44 let idx = build_tbi(src)?;
45 tabix::fs::write(dst, &idx)
46 }
47 }
48}
49
50fn build_csi(src: &Path) -> io::Result<csi::Index> {
52 let (header, mut reader) = open(src)?;
53
54 let ref_names: ReferenceSequenceNames = header
58 .contigs()
59 .keys()
60 .map(|k| bstr::BString::from(k.as_str()))
61 .collect();
62
63 let csi_header = HeaderBuilder::vcf()
64 .set_reference_sequence_names(ref_names)
65 .build();
66
67 let mut indexer =
69 binning_index::Indexer::<BinnedIndex>::new(CSI_MIN_SHIFT, CSI_DEPTH).set_header(csi_header);
70
71 let contig_count = header.contigs().len();
72
73 let mut linear_indices: Vec<LinearIndexBuilder> = (0..contig_count)
81 .map(|_| LinearIndexBuilder::default())
82 .collect();
83
84 let mut line = Vec::new();
85 let mut start_pos = reader.virtual_position();
86
87 while read_line(&mut reader, &mut line)? != 0 {
88 let end_pos = reader.virtual_position();
89 let chunk = Chunk::new(start_pos, end_pos);
90
91 let (ref_name, start, end) = parse_interval(&line, &header)?;
92 let ref_id = header.contigs().get_index_of(ref_name).ok_or_else(|| {
93 io::Error::new(
94 io::ErrorKind::InvalidData,
95 format!("contig '{ref_name}' not declared in VCF header"),
96 )
97 })?;
98
99 linear_indices[ref_id].insert(start, end, start_pos);
100 indexer.add_record(Some((ref_id, start, end, true)), chunk)?;
101 start_pos = end_pos;
102 }
103
104 let index = indexer.build(contig_count);
105 Ok(apply_linear_loffsets(&index, &mut linear_indices))
106}
107
108#[derive(Default)]
111struct LinearIndexBuilder {
112 offsets: Vec<Option<bgzf::VirtualPosition>>,
113}
114
115impl LinearIndexBuilder {
116 fn insert(&mut self, start: Position, end: Position, offset: bgzf::VirtualPosition) {
121 let shift = usize::from(CSI_MIN_SHIFT);
122 let beg = (usize::from(start) - 1) >> shift;
123 let end = (usize::from(end) - 1) >> shift;
124
125 if self.offsets.len() < end + 1 {
126 self.offsets.resize(end + 1, None);
127 }
128
129 for window in &mut self.offsets[beg..=end] {
130 window.get_or_insert(offset);
131 }
132 }
133
134 fn finish(&mut self) {
138 for i in (0..self.offsets.len().saturating_sub(1)).rev() {
139 if self.offsets[i].is_none() {
140 self.offsets[i] = self.offsets[i + 1];
141 }
142 }
143 }
144
145 fn loffset(&self, bin_id: usize) -> bgzf::VirtualPosition {
150 let bot = hts_bin_bot(bin_id, CSI_DEPTH);
151 self.offsets.get(bot).copied().flatten().unwrap_or_default()
152 }
153}
154
155fn hts_bin_first(level: u32) -> usize {
157 ((1usize << (3 * level)) - 1) / 7
158}
159
160fn hts_bin_level(mut bin: usize) -> u32 {
162 let mut level = 0;
163 while bin != 0 {
164 bin = (bin - 1) >> 3;
165 level += 1;
166 }
167 level
168}
169
170fn hts_bin_bot(bin: usize, depth: u8) -> usize {
173 let level = hts_bin_level(bin);
174 let n_lvls = u32::from(depth);
175 (bin - hts_bin_first(level)) << ((n_lvls - level) * 3)
176}
177
178fn apply_linear_loffsets(
184 index: &csi::Index,
185 linear_indices: &mut [LinearIndexBuilder],
186) -> csi::Index {
187 let min_shift = index.min_shift();
188 let depth = index.depth();
189 let header = index.header().cloned();
190 let unplaced = index.unplaced_unmapped_record_count();
191
192 let reference_sequences: Vec<ReferenceSequence<BinnedIndex>> = index
193 .reference_sequences()
194 .iter()
195 .zip(linear_indices.iter_mut())
196 .map(|(reference_sequence, linear_index)| {
197 linear_index.finish();
198
199 let bins = reference_sequence.bins().clone();
200 let loffsets: BinnedIndex = bins
201 .keys()
202 .map(|&bin_id| (bin_id, linear_index.loffset(bin_id)))
203 .collect();
204 let metadata = csi_metadata(reference_sequence).cloned();
205
206 ReferenceSequence::new(bins, loffsets, metadata)
207 })
208 .collect();
209
210 let mut builder = csi::Index::builder()
211 .set_min_shift(min_shift)
212 .set_depth(depth)
213 .set_reference_sequences(reference_sequences);
214
215 if let Some(header) = header {
216 builder = builder.set_header(header);
217 }
218 if let Some(count) = unplaced {
219 builder = builder.set_unplaced_unmapped_record_count(count);
220 }
221
222 builder.build()
223}
224
225fn csi_metadata(
228 reference_sequence: &ReferenceSequence<BinnedIndex>,
229) -> Option<&csi::binning_index::index::reference_sequence::Metadata> {
230 use csi::binning_index::ReferenceSequence as _;
231 reference_sequence.metadata()
232}
233
234fn build_tbi(src: &Path) -> io::Result<tabix::Index> {
236 let (header, mut reader) = open(src)?;
237
238 let mut indexer = tabix::index::Indexer::default();
239 indexer.set_header(HeaderBuilder::vcf().build());
240
241 let mut line = Vec::new();
242 let mut start_pos = reader.virtual_position();
243
244 while read_line(&mut reader, &mut line)? != 0 {
245 let end_pos = reader.virtual_position();
246 let chunk = Chunk::new(start_pos, end_pos);
247
248 let (ref_name, start, end) = parse_interval(&line, &header)?;
249 indexer.add_record(ref_name, start, end, chunk)?;
250 start_pos = end_pos;
251 }
252
253 Ok(indexer.build())
254}
255
256fn open(src: &Path) -> io::Result<(Header, bgzf::io::Reader<File>)> {
260 let mut reader = File::open(src)
261 .map(bgzf::io::Reader::new)
262 .map(vcf::io::Reader::new)?;
263 let header = reader.read_header()?;
264 Ok((header, reader.into_inner()))
265}
266
267fn read_line<R>(reader: &mut R, dst: &mut Vec<u8>) -> io::Result<usize>
271where
272 R: BufRead,
273{
274 const LINE_FEED: u8 = b'\n';
275 const CARRIAGE_RETURN: u8 = b'\r';
276
277 dst.clear();
278 match reader.read_until(LINE_FEED, dst)? {
279 0 => Ok(0),
280 n => {
281 if dst.last() == Some(&LINE_FEED) {
282 dst.pop();
283 if dst.last() == Some(&CARRIAGE_RETURN) {
284 dst.pop();
285 }
286 }
287 Ok(n)
288 }
289 }
290}
291
292fn parse_interval<'a>(
303 line: &'a [u8],
304 header: &Header,
305) -> io::Result<(&'a str, Position, Position)> {
306 let chrom_end = memchr::memchr(b'\t', line).ok_or_else(|| invalid("VCF record missing POS"))?;
307 let ref_name = std::str::from_utf8(&line[..chrom_end])
308 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
309
310 let after_chrom = &line[chrom_end + 1..];
311 let pos_end =
312 memchr::memchr(b'\t', after_chrom).ok_or_else(|| invalid("VCF record missing ID"))?;
313
314 let pos = parse_usize(&after_chrom[..pos_end])?;
317 let start =
318 Position::try_from(pos).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
319
320 let after_pos = &after_chrom[pos_end + 1..];
322 let id_end =
323 memchr::memchr(b'\t', after_pos).ok_or_else(|| invalid("VCF record missing REF"))?;
324 let after_id = &after_pos[id_end + 1..];
325 let ref_len =
326 memchr::memchr(b'\t', after_id).ok_or_else(|| invalid("VCF record missing ALT"))?;
327 if ref_len == 0 {
328 return Err(invalid("invalid reference bases length"));
329 }
330
331 let mut end = start
333 .checked_add(ref_len - 1)
334 .ok_or_else(|| invalid("position overflow"))?;
335 if let Some(end_pos) = info_end(after_id, header)? {
336 let end_position = Position::try_from(end_pos)
337 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
338 end = end.max(end_position);
339 }
340
341 Ok((ref_name, start, end))
342}
343
344fn info_end(after_id: &[u8], header: &Header) -> io::Result<Option<usize>> {
350 let ff = header.file_format();
351 if (ff.major(), ff.minor()) >= (4, 5) {
352 return Ok(None);
353 }
354
355 let Some(info) = nth_tab_field(after_id, 4) else {
358 return Ok(None);
359 };
360 if info == b"." {
361 return Ok(None);
362 }
363
364 match find_info_value(info, b"END") {
365 Some(value) => Ok(Some(parse_usize(value)?)),
366 None => Ok(None),
367 }
368}
369
370fn find_info_value<'a>(info: &'a [u8], key: &[u8]) -> Option<&'a [u8]> {
373 for entry in info.split(|&b| b == b';') {
374 if let Some(eq) = memchr::memchr(b'=', entry)
375 && &entry[..eq] == key
376 {
377 return Some(&entry[eq + 1..]);
378 }
379 }
380 None
381}
382
383fn nth_tab_field(src: &[u8], n: usize) -> Option<&[u8]> {
386 let mut rest = src;
387 for _ in 0..n {
388 let i = memchr::memchr(b'\t', rest)?;
389 rest = &rest[i + 1..];
390 }
391 let end = memchr::memchr(b'\t', rest).unwrap_or(rest.len());
392 Some(&rest[..end])
393}
394
395fn parse_usize(bytes: &[u8]) -> io::Result<usize> {
396 let s =
397 std::str::from_utf8(bytes).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
398 s.parse::<usize>()
399 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
400}
401
402fn invalid(msg: &'static str) -> io::Error {
403 io::Error::new(io::ErrorKind::InvalidData, msg)
404}