use std::borrow::Cow;
use std::ops::Range;
use memchr::memchr;
use crate::{Block, ChainError, Strand};
/// Parsed fields of a single `chain` header line.
///
/// Names are kept as byte ranges instead of owned strings, and `blocks` is an
/// index range that the header parser leaves empty (`0..0`) for the caller to fill.
#[derive(Debug)]
pub(crate) struct ChainMeta {
/// Signed value parsed from the `score` header field.
pub score: i64,
/// Byte range of the `tName` token: the parse offset plus the token's bounds in the line.
pub reference_name: Range<usize>,
/// Value parsed from the `tSize` header field.
pub reference_size: u32,
/// Strand parsed from the `tStrand` header field (`+` or `-`).
pub reference_strand: Strand,
/// Value parsed from the `tStart` header field.
pub reference_start: u32,
/// Value parsed from the `tEnd` header field.
pub reference_end: u32,
/// Byte range of the `qName` token: the parse offset plus the token's bounds in the line.
pub query_name: Range<usize>,
/// Value parsed from the `qSize` header field.
pub query_size: u32,
/// Strand parsed from the `qStrand` header field (`+` or `-`).
pub query_strand: Strand,
/// Value parsed from the `qStart` header field.
pub query_start: u32,
/// Value parsed from the `qEnd` header field.
pub query_end: u32,
/// Value of the trailing `id` field, or a caller-supplied default when the field is absent.
pub id: u64,
/// Index range of this chain's blocks; `0..0` straight out of the header parser.
/// NOTE(review): presumably indexes the block collection built alongside this
/// header — confirm against callers.
pub blocks: Range<usize>,
}
/// Extracts the line that begins at byte index `start` of `bytes`.
///
/// Returns `(next, line)`, where `line` excludes the terminating `\n` and at most one
/// `\r` immediately before it, and `next` is the index just past the `\n` (or
/// `bytes.len()` when the final line has no newline).
///
/// When `start` is at or beyond the end of `bytes`, the result is `(bytes.len(), &[])`.
pub(crate) fn read_line(bytes: &[u8], start: usize) -> (usize, &[u8]) {
    let total = bytes.len();
    if start >= total {
        return (total, &bytes[total..]);
    }

    let rest = &bytes[start..];
    let (next, raw) = match memchr(b'\n', rest) {
        Some(rel) => (start + rel + 1, &rest[..rel]),
        None => (total, rest),
    };

    // Tolerate CRLF line endings by dropping a single trailing carriage return.
    (next, raw.strip_suffix(b"\r").unwrap_or(raw))
}
/// Returns `true` when `line` is empty or consists solely of ASCII whitespace bytes
/// (as defined by `u8::is_ascii_whitespace`).
pub(crate) fn is_blank(line: &[u8]) -> bool {
    !line.iter().any(|byte| !byte.is_ascii_whitespace())
}
/// Parses a `chain` header line that is required to carry an explicit id.
///
/// `offset` is forwarded to the underlying parser and is also the location reported
/// when the id is absent.
///
/// # Errors
///
/// Propagates any error from [`parse_header_with_default_id`], and returns
/// `ChainError::Format` with the message `id missing` when the line has no id token.
// Within this file the only caller is the `parallel`-gated `parse_chain_in_range`,
// hence the conditional `dead_code` allowance.
#[cfg_attr(not(feature = "parallel"), allow(dead_code))]
pub(crate) fn parse_header(line: &[u8], offset: usize) -> Result<ChainMeta, ChainError> {
    match parse_header_with_default_id(line, offset, 0)? {
        (meta, true) => Ok(meta),
        (_, false) => Err(ChainError::Format {
            offset,
            msg: Cow::Borrowed("id missing"),
        }),
    }
}
/// Parses a `chain` header line, using `default_id` when the line has no id token.
///
/// Whitespace-separated tokens are consumed in this order: the literal `chain`,
/// `score`, `tName`, `tSize`, `tStrand`, `tStart`, `tEnd`, `qName`, `qSize`,
/// `qStrand`, `qStart`, `qEnd`, then an optional `id`. Anything after `id` is not
/// inspected.
///
/// `offset` is added to in-line token positions to form the name ranges and the
/// locations reported in errors.
///
/// Returns the parsed metadata (with `blocks` set to `0..0`) together with a flag
/// that is `true` only when the id was read from the line.
///
/// # Errors
///
/// Returns `ChainError::Format` when the line has no tokens, does not start with
/// `chain`, or when any field is missing or fails to parse.
pub(crate) fn parse_header_with_default_id(
    line: &[u8],
    offset: usize,
    default_id: u64,
) -> Result<(ChainMeta, bool), ChainError> {
    let mut tokens = TokenCursor::new(line);

    match tokens.next() {
        Some((start, end)) if &line[start..end] == b"chain" => {}
        Some(_) => {
            return Err(ChainError::Format {
                offset,
                msg: Cow::Borrowed("header does not start with 'chain'"),
            });
        }
        None => {
            return Err(ChainError::Format {
                offset,
                msg: Cow::Borrowed("empty header line"),
            });
        }
    }

    // Field initializers run in the order written, which is the token order of the line.
    let mut meta = ChainMeta {
        score: parse_i64_token(&mut tokens, line, offset, "score")?,
        reference_name: parse_range_token(&mut tokens, offset, "tName")?,
        reference_size: parse_u32_token(&mut tokens, line, offset, "tSize")?,
        reference_strand: parse_strand_token(&mut tokens, line, offset, "tStrand")?,
        reference_start: parse_u32_token(&mut tokens, line, offset, "tStart")?,
        reference_end: parse_u32_token(&mut tokens, line, offset, "tEnd")?,
        query_name: parse_range_token(&mut tokens, offset, "qName")?,
        query_size: parse_u32_token(&mut tokens, line, offset, "qSize")?,
        query_strand: parse_strand_token(&mut tokens, line, offset, "qStrand")?,
        query_start: parse_u32_token(&mut tokens, line, offset, "qStart")?,
        query_end: parse_u32_token(&mut tokens, line, offset, "qEnd")?,
        id: default_id,
        blocks: 0..0,
    };

    // The id is optional: overwrite the default only when a token is actually present.
    let has_explicit_id = match tokens.next() {
        Some((start, end)) => {
            meta.id = parse_u64(&line[start..end], offset + start, "id")?;
            true
        }
        None => false,
    };

    Ok((meta, has_explicit_id))
}
/// Parses one alignment block line: either `size` alone or `size dt dq`.
///
/// A line holding only the size yields a block whose two gaps are zero. Tokens
/// after `dq` are not inspected. `offset` is added to in-line token positions when
/// reporting where a numeric token failed to parse.
///
/// # Errors
///
/// Returns `ChainError::Format` when the size is missing, when `dt` is present
/// without `dq`, or when any value is not a decimal number that fits in `u32`.
pub(crate) fn parse_block(line: &[u8], offset: usize) -> Result<Block, ChainError> {
    let mut tokens = TokenCursor::new(line);
    let size = parse_u32_token(&mut tokens, line, offset, "block size")?;

    // No second token: this is the short form carrying only the size.
    let Some((dt_start, dt_end)) = tokens.next() else {
        return Ok(Block {
            size,
            gap_reference: 0,
            gap_query: 0,
        });
    };
    let gap_reference = parse_u32(&line[dt_start..dt_end], offset + dt_start, "dt")?;

    let (dq_start, dq_end) = tokens.next().ok_or_else(|| ChainError::Format {
        offset,
        msg: Cow::Borrowed("block line missing dq value"),
    })?;
    let gap_query = parse_u32(&line[dq_start..dq_end], offset + dq_start, "dq")?;

    Ok(Block {
        size,
        gap_reference,
        gap_query,
    })
}
/// Parses a single chain — its header line followed by block lines — from
/// `bytes[range]`.
///
/// Block lines are read until the first blank line or the end of the range.
/// Offsets reported in errors are relative to the start of `bytes`. The returned
/// metadata has `blocks` set to `0..n`, where `n` is the number of blocks returned.
///
/// # Errors
///
/// Returns `ChainError::Format` when the header (which must include an id) or any
/// block line is malformed, or when the chain has no block lines.
///
/// # Panics
///
/// Panics if `range` is not a valid index range into `bytes`.
#[cfg(feature = "parallel")]
pub(crate) fn parse_chain_in_range(
    bytes: &[u8],
    range: Range<usize>,
) -> Result<(ChainMeta, Vec<Block>), ChainError> {
    let base = range.start;
    let chunk = &bytes[range];

    // The header is the first line of the chunk, so it sits exactly at `base`.
    let (mut cursor, header_line) = read_line(chunk, 0);
    let mut meta = parse_header(header_line, base)?;

    let mut blocks = Vec::new();
    while cursor < chunk.len() {
        let line_start = cursor;
        let (after, line) = read_line(chunk, cursor);
        if is_blank(line) {
            break;
        }
        blocks.push(parse_block(line, base + line_start)?);
        cursor = after;
    }

    if blocks.is_empty() {
        return Err(ChainError::Format {
            offset: base,
            msg: Cow::Borrowed("chain without any alignment blocks"),
        });
    }

    meta.blocks = 0..blocks.len();
    Ok((meta, blocks))
}
/// Forward-only scanner that yields the whitespace-separated tokens of a byte line
/// as `(start, end)` index pairs.
pub(crate) struct TokenCursor<'a> {
    /// The line being tokenized.
    line: &'a [u8],
    /// Index of the next byte that has not been examined yet.
    pos: usize,
}

impl<'a> TokenCursor<'a> {
    /// Creates a cursor positioned at the beginning of `line`.
    fn new(line: &'a [u8]) -> Self {
        Self { line, pos: 0 }
    }

    /// Skips leading ASCII whitespace and returns the half-open byte range
    /// `(start, end)` of the next token, or `None` once only whitespace remains.
    fn next(&mut self) -> Option<(usize, usize)> {
        let remaining = &self.line[self.pos..];

        let Some(lead) = remaining.iter().position(|b| !b.is_ascii_whitespace()) else {
            // Nothing but whitespace left: park the cursor at the end of the line.
            self.pos = self.line.len();
            return None;
        };

        let start = self.pos + lead;
        let token_len = self.line[start..]
            .iter()
            .position(|b| b.is_ascii_whitespace())
            .unwrap_or(self.line.len() - start);

        self.pos = start + token_len;
        Some((start, self.pos))
    }
}
/// Takes the next token from `cursor` and parses it as a signed 64-bit integer.
///
/// `label` names the field in error messages. A missing token is reported at
/// `offset`; a malformed one is reported by `parse_i64` relative to the token's
/// position (`offset` plus its start index).
fn parse_i64_token(
    cursor: &mut TokenCursor<'_>,
    line: &[u8],
    offset: usize,
    label: &'static str,
) -> Result<i64, ChainError> {
    match cursor.next() {
        Some((start, end)) => parse_i64(&line[start..end], offset + start, label),
        None => Err(ChainError::Format {
            offset,
            msg: Cow::Owned(format!("{label} missing")),
        }),
    }
}
/// Takes the next token from `cursor` and parses it as an unsigned 64-bit integer.
///
/// `label` names the field in error messages. A missing token is reported at
/// `offset`; a malformed one is reported by `parse_u64` relative to the token's
/// position (`offset` plus its start index).
fn parse_u64_token(
    cursor: &mut TokenCursor<'_>,
    line: &[u8],
    offset: usize,
    label: &'static str,
) -> Result<u64, ChainError> {
    let (start, end) = cursor.next().ok_or_else(|| ChainError::Format {
        offset,
        msg: Cow::Owned(format!("{label} missing")),
    })?;
    parse_u64(&line[start..end], offset + start, label)
}
/// Takes the next token from `cursor` and parses it as an unsigned 32-bit integer.
///
/// The token is first parsed as `u64` and then narrowed. A value that does not fit
/// in `u32` is reported at `offset` itself (not at the token's position) with the
/// message `<label> exceeds u32`.
fn parse_u32_token(
    cursor: &mut TokenCursor<'_>,
    line: &[u8],
    offset: usize,
    label: &'static str,
) -> Result<u32, ChainError> {
    let wide = parse_u64_token(cursor, line, offset, label)?;
    u32::try_from(wide).map_err(|_| ChainError::Format {
        offset,
        msg: Cow::Owned(format!("{label} exceeds u32")),
    })
}
/// Takes the next token from `cursor` and returns its byte range shifted by `offset`.
///
/// The token's content is not examined; only its position is recorded.
///
/// # Errors
///
/// Returns `ChainError::Format` with the message `<label> missing`, located at
/// `offset`, when the cursor has no further token.
fn parse_range_token(
    cursor: &mut TokenCursor<'_>,
    offset: usize,
    label: &'static str,
) -> Result<Range<usize>, ChainError> {
    cursor
        .next()
        .map(|(start, end)| (offset + start)..(offset + end))
        .ok_or_else(|| ChainError::Format {
            offset,
            msg: Cow::Owned(format!("{label} missing")),
        })
}
/// Takes the next token from `cursor` and interprets it as a strand marker.
///
/// Only the exact one-byte tokens `+` and `-` are accepted.
///
/// # Errors
///
/// Returns `ChainError::Format` located at `offset` when the token is missing or is
/// anything other than `+` or `-`.
fn parse_strand_token(
    cursor: &mut TokenCursor<'_>,
    line: &[u8],
    offset: usize,
    label: &'static str,
) -> Result<Strand, ChainError> {
    let (start, end) = cursor.next().ok_or_else(|| ChainError::Format {
        offset,
        msg: Cow::Owned(format!("{label} missing")),
    })?;

    // Matching the whole token rejects longer tokens that merely begin with a sign.
    match &line[start..end] {
        [b'+'] => Ok(Strand::Plus),
        [b'-'] => Ok(Strand::Minus),
        _ => Err(ChainError::Format {
            offset,
            msg: Cow::Owned(format!("{label} must be '+' or '-'")),
        }),
    }
}
/// Parses `data` as an unsigned decimal integer made up solely of ASCII digits.
///
/// No sign, whitespace, or separators are accepted. `ctx` names the value in error
/// messages and `offset` is the location of `data[0]` used for error reporting.
///
/// # Errors
///
/// Returns `ChainError::Format` when `data` is empty (at `offset`), when a byte is
/// not an ASCII digit (at that byte's position), or when the value exceeds
/// `u64::MAX` (at the digit that caused the overflow).
fn parse_u64(data: &[u8], offset: usize, ctx: &str) -> Result<u64, ChainError> {
    if data.is_empty() {
        return Err(ChainError::Format {
            offset,
            msg: Cow::Owned(format!("{ctx} is empty")),
        });
    }

    data.iter().enumerate().try_fold(0u64, |acc, (idx, &byte)| {
        if !byte.is_ascii_digit() {
            return Err(ChainError::Format {
                offset: offset + idx,
                msg: Cow::Owned(format!("{ctx} contains a non-digit")),
            });
        }
        let digit = u64::from(byte - b'0');
        acc.checked_mul(10)
            .and_then(|scaled| scaled.checked_add(digit))
            .ok_or_else(|| ChainError::Format {
                offset: offset + idx,
                msg: Cow::Owned(format!("{ctx} overflows u64")),
            })
    })
}
/// Parses `data` as an unsigned decimal integer that must fit in `u32`.
///
/// Parsing is delegated to `parse_u64`; a value above `u32::MAX` is then rejected
/// with the message `<ctx> exceeds u32`, located at `offset`.
fn parse_u32(data: &[u8], offset: usize, ctx: &str) -> Result<u32, ChainError> {
    let wide = parse_u64(data, offset, ctx)?;
    u32::try_from(wide).map_err(|_| ChainError::Format {
        offset,
        msg: Cow::Owned(format!("{ctx} exceeds u32")),
    })
}
/// Parses `data` as a signed decimal integer: an optional leading `-` followed by
/// ASCII digits.
///
/// A leading `+` is not accepted. `ctx` names the value in error messages and
/// `offset` is the location of `data[0]` used for error reporting.
///
/// The full `i64` range is supported, including `i64::MIN`
/// (`-9223372036854775808`).
///
/// # Errors
///
/// Returns `ChainError::Format` when `data` is empty, when the digits are missing
/// or malformed (as reported by `parse_u64`), or when the value lies outside
/// `i64::MIN..=i64::MAX`.
fn parse_i64(data: &[u8], offset: usize, ctx: &str) -> Result<i64, ChainError> {
    if data.is_empty() {
        return Err(ChainError::Format {
            offset,
            msg: Cow::Owned(format!("{ctx} is empty")),
        });
    }

    let (negative, digits) = match data.strip_prefix(b"-") {
        Some(rest) => (true, rest),
        None => (false, data),
    };

    // The digits start one byte later when a sign was consumed.
    let magnitude = parse_u64(digits, offset + usize::from(negative), ctx)?;

    if negative {
        // Negate in i128, which can hold every negated u64, then range-check.
        // Casting the magnitude with `as i64` first would wrap for values above
        // i64::MAX: it would reject i64::MIN and turn larger magnitudes into
        // bogus positive results.
        i64::try_from(-i128::from(magnitude)).map_err(|_| ChainError::Format {
            offset,
            msg: Cow::Owned(format!("{ctx} underflows i64")),
        })
    } else {
        i64::try_from(magnitude).map_err(|_| ChainError::Format {
            offset,
            msg: Cow::Owned(format!("{ctx} overflows i64")),
        })
    }
}