Skip to main content

rsomics_tabix/
config.rs

1use std::io;
2use std::str::FromStr;
3
4use noodles::csi::binning_index::index::header::format::{CoordinateSystem, Format};
5
6use crate::{Interval, invalid, parse_coord};
7
8/// Which on-disk index format to write.
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub enum IndexKind {
11    /// Tabix linear index (.tbi). The htslib `tabix` default.
12    Tbi,
13    /// Coordinate-sorted index (.csi).
14    Csi,
15}
16
17/// htslib's named presets (`-p`). Each fixes the column layout, comment
18/// character, and coordinate base; `Generic` carries explicit columns set with
19/// `-s/-b/-e`.
20#[derive(Debug, Clone, Copy, PartialEq, Eq)]
21pub enum Preset {
22    Gff,
23    Bed,
24    Sam,
25    Vcf,
26}
27
28impl FromStr for Preset {
29    type Err = String;
30
31    fn from_str(s: &str) -> Result<Self, Self::Err> {
32        match s {
33            "gff" => Ok(Self::Gff),
34            "bed" => Ok(Self::Bed),
35            "sam" => Ok(Self::Sam),
36            "vcf" => Ok(Self::Vcf),
37            other => Err(format!(
38                "unknown preset '{other}' (expected gff, bed, sam, or vcf)"
39            )),
40        }
41    }
42}
43
44/// htslib `tbx_conf_t`: the column layout and parsing rules used to derive each
45/// line's index interval. The four named presets are the upstream constants
46/// (`tbx_conf_gff/bed/sam/vcf` in tbx.c); a custom layout via `-s/-b/-e` is the
47/// `TBX_GENERIC` path.
48#[derive(Debug, Clone, Copy)]
49pub struct Config {
50    /// 1-based sequence-name column (`sc`).
51    pub seq_col: usize,
52    /// 1-based begin-coordinate column (`bc`).
53    pub begin_col: usize,
54    /// 1-based end-coordinate column (`ec`); 0 means "no end column".
55    pub end_col: usize,
56    /// Comment-line marker (`meta_char`).
57    pub comment: u8,
58    /// Leading lines skipped unconditionally (`line_skip`).
59    pub skip: u32,
60    /// Coordinates are already 0-based half-open (`TBX_UCSC`); otherwise 1-based.
61    pub zero_based: bool,
62    /// SAM-style: end derived from the CIGAR in column 6.
63    pub sam: bool,
64    /// VCF-style: end derived from REF length / INFO END.
65    pub vcf: bool,
66}
67
68impl Default for Config {
69    /// htslib's default config when no preset is given is GFF (`tbx_conf_gff`).
70    fn default() -> Self {
71        Self::from_preset(Preset::Gff)
72    }
73}
74
75impl Config {
76    /// The upstream `tbx_conf_*` constants from htslib tbx.c, expressed as
77    /// `{preset, sc, bc, ec, meta_char, line_skip}`:
78    /// gff `{0,1,4,5,'#',0}`, bed `{UCSC,1,2,3,'#',0}`,
79    /// sam `{SAM,3,4,0,'@',0}`, vcf `{VCF,1,2,0,'#',0}`.
80    pub fn from_preset(preset: Preset) -> Self {
81        match preset {
82            Preset::Gff => Config {
83                seq_col: 1,
84                begin_col: 4,
85                end_col: 5,
86                comment: b'#',
87                skip: 0,
88                zero_based: false,
89                sam: false,
90                vcf: false,
91            },
92            Preset::Bed => Config {
93                seq_col: 1,
94                begin_col: 2,
95                end_col: 3,
96                comment: b'#',
97                skip: 0,
98                zero_based: true,
99                sam: false,
100                vcf: false,
101            },
102            Preset::Sam => Config {
103                seq_col: 3,
104                begin_col: 4,
105                end_col: 0,
106                comment: b'@',
107                skip: 0,
108                zero_based: false,
109                sam: true,
110                vcf: false,
111            },
112            Preset::Vcf => Config {
113                seq_col: 1,
114                begin_col: 2,
115                end_col: 0,
116                comment: b'#',
117                skip: 0,
118                zero_based: false,
119                sam: false,
120                vcf: true,
121            },
122        }
123    }
124
125    /// The noodles tabix `Format` this config records in the index header, so a
126    /// downstream reader resolves coordinates the same way htslib does.
127    pub fn format(&self) -> Format {
128        if self.sam {
129            Format::Sam
130        } else if self.vcf {
131            Format::Vcf
132        } else if self.zero_based {
133            Format::Generic(CoordinateSystem::Bed)
134        } else {
135            Format::Generic(CoordinateSystem::Gff)
136        }
137    }
138
139    /// Is `line` a comment / header line that htslib excludes from the index?
140    /// Lines within the `skip` count or beginning with the comment char.
141    pub fn is_meta(&self, line_no: u32, line: &[u8]) -> bool {
142        line_no <= self.skip || line.first() == Some(&self.comment)
143    }
144
145    /// Derive a record's 1-based inclusive index interval, mirroring htslib
146    /// `tbx_parse1`. Internal arithmetic is in 0-based half-open `[beg, end)`;
147    /// the returned interval is 1-based inclusive `[beg+1, end]`. The line is
148    /// tokenised in a single forward pass up to the deepest column the layout
149    /// touches, rather than re-scanning from the start per column.
150    pub(crate) fn interval<'a>(&self, line: &'a [u8]) -> io::Result<Interval<'a>> {
151        let needs_info = self.vcf;
152        let needs_cigar = self.sam;
153        let max_col = self
154            .seq_col
155            .max(self.begin_col)
156            .max(self.end_col)
157            .max(if needs_info { 8 } else { 0 })
158            .max(if needs_cigar { 6 } else { 0 })
159            .max(if needs_info { 4 } else { 0 });
160
161        let mut seq: &[u8] = b"";
162        let mut begin: &[u8] = b"";
163        let mut end_field: &[u8] = b"";
164        let mut ref_field: &[u8] = b"";
165        let mut cigar: &[u8] = b"";
166        let mut info: &[u8] = b"";
167
168        let mut rest = line;
169        let mut col = 1;
170        loop {
171            let tab = memchr::memchr(b'\t', rest);
172            let field = match tab {
173                Some(i) => &rest[..i],
174                None => rest,
175            };
176            if col == self.seq_col {
177                seq = field;
178            }
179            if col == self.begin_col {
180                begin = field;
181            }
182            if self.end_col != 0 && col == self.end_col {
183                end_field = field;
184            }
185            if needs_info && col == 4 {
186                ref_field = field;
187            }
188            if needs_cigar && col == 6 {
189                cigar = field;
190            }
191            if needs_info && col == 8 {
192                info = field;
193            }
194            match tab {
195                Some(i) if col < max_col => {
196                    rest = &rest[i + 1..];
197                    col += 1;
198                }
199                _ => break,
200            }
201        }
202
203        if col < max_col || seq.is_empty() {
204            return Err(invalid(format!("line has fewer than {max_col} columns")));
205        }
206
207        let ref_name = seq;
208        let raw_begin = parse_coord(begin)?;
209        let beg = if self.zero_based {
210            raw_begin
211        } else {
212            raw_begin - 1
213        };
214
215        let end = if self.sam {
216            beg + sam_ref_span(cigar)?
217        } else if self.vcf {
218            beg + vcf_ref_span(ref_field, info, beg)?
219        } else if self.end_col >= self.begin_col && self.end_col != 0 {
220            parse_coord(end_field)?
221        } else {
222            beg + 1
223        };
224
225        if beg < 0 {
226            let name = String::from_utf8_lossy(ref_name);
227            return Err(invalid(format!(
228                "negative start coordinate on contig '{name}'"
229            )));
230        }
231        if end < beg {
232            let name = String::from_utf8_lossy(ref_name);
233            return Err(invalid(format!(
234                "end before start on contig '{name}' ({end} < {beg})"
235            )));
236        }
237
238        let start = usize::try_from(beg + 1).map_err(|e| invalid(e.to_string()))?;
239        let end = usize::try_from(end.max(beg + 1)).map_err(|e| invalid(e.to_string()))?;
240
241        Ok(Interval {
242            ref_name,
243            start,
244            end,
245        })
246    }
247}
248
249/// SAM reference span: sum of M/D/N/=/X CIGAR ops. htslib walks the CIGAR
250/// string accumulating consuming-reference operation lengths.
251fn sam_ref_span(cigar: &[u8]) -> io::Result<i64> {
252    if cigar == b"*" {
253        return Ok(1);
254    }
255    let mut span: i64 = 0;
256    let mut num: i64 = 0;
257    let mut saw_digit = false;
258    for &b in cigar {
259        if b.is_ascii_digit() {
260            num = num * 10 + i64::from(b - b'0');
261            saw_digit = true;
262        } else {
263            if !saw_digit {
264                return Err(invalid("malformed CIGAR in SAM record"));
265            }
266            if matches!(b, b'M' | b'D' | b'N' | b'=' | b'X') {
267                span += num;
268            }
269            num = 0;
270            saw_digit = false;
271        }
272    }
273    Ok(span.max(1))
274}
275
276/// VCF reference span end-extension (`tbx_parse1` VCF branch): the reach is
277/// `beg + max(reflen, svlen)`, with an explicit INFO `END=` overriding. REF
278/// length covers SNV/indel; INFO `END`/`SVLEN` cover symbolic SVs.
279fn vcf_ref_span(ref_field: &[u8], info: &[u8], beg: i64) -> io::Result<i64> {
280    let ref_len = ref_field.len() as i64;
281    let mut span = ref_len.max(1);
282
283    if info != b"." {
284        if let Some(end) = info_value(info, b"END") {
285            // INFO END is a 1-based inclusive end; as a half-open 0-based end it
286            // is the value itself. Return it relative to beg so the caller adds
287            // beg back.
288            let end = parse_coord(end)?;
289            return Ok((end - beg).max(span));
290        }
291        if let Some(svlen) = info_value(info, b"SVLEN") {
292            let svlen = parse_coord(svlen)?.abs();
293            span = span.max(svlen);
294        }
295    }
296    Ok(span)
297}
298
299/// Value of `key` in a VCF INFO field (`;`-delimited `KEY=VALUE`).
300fn info_value<'a>(info: &'a [u8], key: &[u8]) -> Option<&'a [u8]> {
301    for entry in info.split(|&b| b == b';') {
302        if let Some(eq) = memchr::memchr(b'=', entry)
303            && &entry[..eq] == key
304        {
305            return Some(&entry[eq + 1..]);
306        }
307    }
308    None
309}