1use std::io;
2use std::str::FromStr;
3
4use noodles::csi::binning_index::index::header::format::{CoordinateSystem, Format};
5
6use crate::{Interval, invalid, parse_coord};
7
8#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub enum IndexKind {
11 Tbi,
13 Csi,
15}
16
17#[derive(Debug, Clone, Copy, PartialEq, Eq)]
21pub enum Preset {
22 Gff,
23 Bed,
24 Sam,
25 Vcf,
26}
27
28impl FromStr for Preset {
29 type Err = String;
30
31 fn from_str(s: &str) -> Result<Self, Self::Err> {
32 match s {
33 "gff" => Ok(Self::Gff),
34 "bed" => Ok(Self::Bed),
35 "sam" => Ok(Self::Sam),
36 "vcf" => Ok(Self::Vcf),
37 other => Err(format!(
38 "unknown preset '{other}' (expected gff, bed, sam, or vcf)"
39 )),
40 }
41 }
42}
43
44#[derive(Debug, Clone, Copy)]
49pub struct Config {
50 pub seq_col: usize,
52 pub begin_col: usize,
54 pub end_col: usize,
56 pub comment: u8,
58 pub skip: u32,
60 pub zero_based: bool,
62 pub sam: bool,
64 pub vcf: bool,
66}
67
68impl Default for Config {
69 fn default() -> Self {
71 Self::from_preset(Preset::Gff)
72 }
73}
74
75impl Config {
76 pub fn from_preset(preset: Preset) -> Self {
81 match preset {
82 Preset::Gff => Config {
83 seq_col: 1,
84 begin_col: 4,
85 end_col: 5,
86 comment: b'#',
87 skip: 0,
88 zero_based: false,
89 sam: false,
90 vcf: false,
91 },
92 Preset::Bed => Config {
93 seq_col: 1,
94 begin_col: 2,
95 end_col: 3,
96 comment: b'#',
97 skip: 0,
98 zero_based: true,
99 sam: false,
100 vcf: false,
101 },
102 Preset::Sam => Config {
103 seq_col: 3,
104 begin_col: 4,
105 end_col: 0,
106 comment: b'@',
107 skip: 0,
108 zero_based: false,
109 sam: true,
110 vcf: false,
111 },
112 Preset::Vcf => Config {
113 seq_col: 1,
114 begin_col: 2,
115 end_col: 0,
116 comment: b'#',
117 skip: 0,
118 zero_based: false,
119 sam: false,
120 vcf: true,
121 },
122 }
123 }
124
125 pub fn format(&self) -> Format {
128 if self.sam {
129 Format::Sam
130 } else if self.vcf {
131 Format::Vcf
132 } else if self.zero_based {
133 Format::Generic(CoordinateSystem::Bed)
134 } else {
135 Format::Generic(CoordinateSystem::Gff)
136 }
137 }
138
139 pub fn is_meta(&self, line_no: u32, line: &[u8]) -> bool {
142 line_no <= self.skip || line.first() == Some(&self.comment)
143 }
144
145 pub(crate) fn interval<'a>(&self, line: &'a [u8]) -> io::Result<Interval<'a>> {
151 let needs_info = self.vcf;
152 let needs_cigar = self.sam;
153 let max_col = self
154 .seq_col
155 .max(self.begin_col)
156 .max(self.end_col)
157 .max(if needs_info { 8 } else { 0 })
158 .max(if needs_cigar { 6 } else { 0 })
159 .max(if needs_info { 4 } else { 0 });
160
161 let mut seq: &[u8] = b"";
162 let mut begin: &[u8] = b"";
163 let mut end_field: &[u8] = b"";
164 let mut ref_field: &[u8] = b"";
165 let mut cigar: &[u8] = b"";
166 let mut info: &[u8] = b"";
167
168 let mut rest = line;
169 let mut col = 1;
170 loop {
171 let tab = memchr::memchr(b'\t', rest);
172 let field = match tab {
173 Some(i) => &rest[..i],
174 None => rest,
175 };
176 if col == self.seq_col {
177 seq = field;
178 }
179 if col == self.begin_col {
180 begin = field;
181 }
182 if self.end_col != 0 && col == self.end_col {
183 end_field = field;
184 }
185 if needs_info && col == 4 {
186 ref_field = field;
187 }
188 if needs_cigar && col == 6 {
189 cigar = field;
190 }
191 if needs_info && col == 8 {
192 info = field;
193 }
194 match tab {
195 Some(i) if col < max_col => {
196 rest = &rest[i + 1..];
197 col += 1;
198 }
199 _ => break,
200 }
201 }
202
203 if col < max_col || seq.is_empty() {
204 return Err(invalid(format!("line has fewer than {max_col} columns")));
205 }
206
207 let ref_name = seq;
208 let raw_begin = parse_coord(begin)?;
209 let beg = if self.zero_based {
210 raw_begin
211 } else {
212 raw_begin - 1
213 };
214
215 let end = if self.sam {
216 beg + sam_ref_span(cigar)?
217 } else if self.vcf {
218 beg + vcf_ref_span(ref_field, info, beg)?
219 } else if self.end_col >= self.begin_col && self.end_col != 0 {
220 parse_coord(end_field)?
221 } else {
222 beg + 1
223 };
224
225 if beg < 0 {
226 let name = String::from_utf8_lossy(ref_name);
227 return Err(invalid(format!(
228 "negative start coordinate on contig '{name}'"
229 )));
230 }
231 if end < beg {
232 let name = String::from_utf8_lossy(ref_name);
233 return Err(invalid(format!(
234 "end before start on contig '{name}' ({end} < {beg})"
235 )));
236 }
237
238 let start = usize::try_from(beg + 1).map_err(|e| invalid(e.to_string()))?;
239 let end = usize::try_from(end.max(beg + 1)).map_err(|e| invalid(e.to_string()))?;
240
241 Ok(Interval {
242 ref_name,
243 start,
244 end,
245 })
246 }
247}
248
249fn sam_ref_span(cigar: &[u8]) -> io::Result<i64> {
252 if cigar == b"*" {
253 return Ok(1);
254 }
255 let mut span: i64 = 0;
256 let mut num: i64 = 0;
257 let mut saw_digit = false;
258 for &b in cigar {
259 if b.is_ascii_digit() {
260 num = num * 10 + i64::from(b - b'0');
261 saw_digit = true;
262 } else {
263 if !saw_digit {
264 return Err(invalid("malformed CIGAR in SAM record"));
265 }
266 if matches!(b, b'M' | b'D' | b'N' | b'=' | b'X') {
267 span += num;
268 }
269 num = 0;
270 saw_digit = false;
271 }
272 }
273 Ok(span.max(1))
274}
275
276fn vcf_ref_span(ref_field: &[u8], info: &[u8], beg: i64) -> io::Result<i64> {
280 let ref_len = ref_field.len() as i64;
281 let mut span = ref_len.max(1);
282
283 if info != b"." {
284 if let Some(end) = info_value(info, b"END") {
285 let end = parse_coord(end)?;
289 return Ok((end - beg).max(span));
290 }
291 if let Some(svlen) = info_value(info, b"SVLEN") {
292 let svlen = parse_coord(svlen)?.abs();
293 span = span.max(svlen);
294 }
295 }
296 Ok(span)
297}
298
299fn info_value<'a>(info: &'a [u8], key: &[u8]) -> Option<&'a [u8]> {
301 for entry in info.split(|&b| b == b';') {
302 if let Some(eq) = memchr::memchr(b'=', entry)
303 && &entry[..eq] == key
304 {
305 return Some(&entry[eq + 1..]);
306 }
307 }
308 None
309}