use crate::lexer;
/// A single FASTQ record whose fields are slices borrowed from the parsed input.
///
/// The field descriptions below reflect how `parse_single_record` fills them in.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FastqRecord<'a> {
/// Header line with the leading `@` and the terminating `\n` removed.
pub name: &'a [u8],
/// Second line of the record, without its terminating `\n`.
pub sequence: &'a [u8],
/// Bytes following the `+` separator line, without a trailing `\n`.
pub quality: &'a [u8],
}
/// Finds the byte offsets at which FASTQ records begin inside `data`.
///
/// A record is taken to be exactly four lines long, so a boundary is emitted
/// right after every fourth `\n`. The returned vector always starts with `0`;
/// every later entry is the index one past a fourth newline, i.e. the end of
/// one record and the start of the next. Trailing bytes that do not reach a
/// fourth newline get no closing offset.
///
/// The input is scanned 64 bytes at a time. Each block is handed to
/// `lexer::lex_block`, whose result is consumed here as a 64-bit mask with
/// bit `i` set when byte `i` of the block is a newline.
#[must_use]
// The only `expect` below operates on slices produced by `chunks_exact(64)`,
// which are always exactly 64 bytes long, so it cannot actually panic.
#[allow(clippy::missing_panics_doc)]
pub fn find_record_offsets(data: &[u8]) -> Vec<usize> {
    if data.is_empty() {
        return vec![0];
    }

    // Capacity is a heuristic: one boundary per 300 input bytes, plus slack.
    let mut offsets = Vec::with_capacity(data.len() / 300 + 2);
    offsets.push(0);

    // Running newline total, shared across blocks so that records spanning a
    // block boundary are still counted correctly.
    let mut newline_count: u32 = 0;

    let mut full_blocks = data.chunks_exact(64);
    let mut block_start = 0usize;
    for chunk in full_blocks.by_ref() {
        let block: &[u8; 64] = chunk.try_into().expect("slice is exactly 64 bytes");
        process_bitmask(lexer::lex_block(block), block_start, &mut newline_count, &mut offsets);
        block_start += 64;
    }

    let tail = full_blocks.remainder();
    if !tail.is_empty() {
        // Zero-pad the final partial block to the 64 bytes the lexer expects.
        let mut padded = [0u8; 64];
        padded[..tail.len()].copy_from_slice(tail);

        // `tail.len()` is in 1..=63 here, so the shift cannot overflow. The
        // mask discards any bits reported for the padding bytes.
        let valid_mask = (1u64 << tail.len()) - 1;
        let masked_newlines = lexer::lex_block(&padded) & valid_mask;
        process_bitmask(masked_newlines, block_start, &mut newline_count, &mut offsets);
    }

    offsets
}
/// Consumes one block's newline bitmask and records record boundaries.
///
/// * `newlines` - mask in which set bit `i` marks a newline at byte
///   `block_start + i` of the input.
/// * `block_start` - offset of the block's first byte within the whole input.
/// * `newline_count` - running count of newlines seen so far; updated in place
///   so the count carries over from one block to the next.
/// * `offsets` - receives `position + 1` for every newline that brings the
///   running count to a multiple of four.
///
/// The counter wraps on overflow instead of panicking: only its value modulo 4
/// is ever inspected, and 2^32 is itself a multiple of 4, so wrapping leaves
/// the "every fourth newline" decision unchanged for inputs with more than
/// `u32::MAX` newlines.
#[inline]
fn process_bitmask(
    mut newlines: u64,
    block_start: usize,
    newline_count: &mut u32,
    offsets: &mut Vec<usize>,
) {
    while newlines != 0 {
        // Index of the lowest set bit == position of the next newline in the block.
        let bit_pos = newlines.trailing_zeros() as usize;

        *newline_count = (*newline_count).wrapping_add(1);
        if (*newline_count).is_multiple_of(4) {
            // The next record starts on the byte right after this newline.
            offsets.push(block_start + bit_pos + 1);
        }

        // Clear the lowest set bit; `newlines` is non-zero so this cannot underflow.
        newlines &= newlines - 1;
    }
}
/// Returns an iterator over the complete FASTQ records contained in `data`.
///
/// Record boundaries come from `find_record_offsets`; each span between two
/// consecutive boundaries is parsed lazily as the iterator is advanced.
/// Trailing bytes that are not closed by a record boundary are not yielded.
///
/// # Panics
///
/// Advancing the iterator panics if a record span is rejected by
/// `parse_single_record` (for example when it does not start with `@`).
pub fn parse_records(data: &[u8]) -> impl Iterator<Item = FastqRecord<'_>> {
    RecordIter { data, offsets: find_record_offsets(data), idx: 0 }
}
/// Iterator state used by `parse_records` to walk consecutive record spans.
struct RecordIter<'a> {
/// The full input buffer that yielded records borrow from.
data: &'a [u8],
/// Record boundaries as returned by `find_record_offsets`.
offsets: Vec<usize>,
/// Index into `offsets` of the start of the next record to yield.
idx: usize,
}
impl<'a> Iterator for RecordIter<'a> {
    type Item = FastqRecord<'a>;

    /// Parses and returns the record lying between the next two boundaries,
    /// or `None` once fewer than two boundaries remain.
    ///
    /// # Panics
    ///
    /// Panics if the span is rejected by `parse_single_record`.
    fn next(&mut self) -> Option<Self::Item> {
        // Checked lookups: running out of boundaries simply ends iteration,
        // and `idx` is left untouched in that case.
        let start = *self.offsets.get(self.idx)?;
        let end = *self.offsets.get(self.idx + 1)?;
        self.idx += 1;
        Some(parse_single_record(&self.data[start..end]))
    }

    /// Reports the exact number of records left, so that consumers such as
    /// `collect` can allocate once up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // N boundaries delimit N - 1 records; `idx` of them were already yielded.
        let remaining = self.offsets.len().saturating_sub(self.idx + 1);
        (remaining, Some(remaining))
    }
}
/// Splits one FASTQ record into its name, sequence and quality fields.
///
/// `record` must begin with `@` and contain at least three `\n` bytes, which
/// terminate the header, sequence and separator lines respectively. The
/// content of the separator (third) line is not inspected.
///
/// The quality field runs from just after the third newline up to the fourth
/// newline, or to the end of `record` when there is no fourth newline. It is
/// therefore empty when `record` ends directly after the separator line, and
/// any bytes following a fourth newline are ignored.
///
/// # Panics
///
/// Panics with a message containing
/// * `FASTQ record must start with @` if `record` is empty or its first byte
///   is not `@`;
/// * `FASTQ record must have at least 3 internal newlines` if fewer than three
///   newlines are present.
pub(crate) fn parse_single_record(record: &[u8]) -> FastqRecord<'_> {
    assert!(record.first() == Some(&b'@'), "FASTQ record must start with @");

    // `split` is lazy, so only the first four lines are ever scanned. Four
    // pieces exist exactly when the record holds at least three newlines.
    let mut lines = record.split(|&byte| byte == b'\n');
    let (Some(header), Some(sequence), Some(_separator), Some(quality)) =
        (lines.next(), lines.next(), lines.next(), lines.next())
    else {
        panic!("FASTQ record must have at least 3 internal newlines");
    };

    // The assertion above guarantees `header` starts with `@`, so it is
    // non-empty and skipping one byte is in bounds.
    FastqRecord { name: &header[1..], sequence, quality }
}
// Unit tests for record boundary detection (`find_record_offsets`) and for
// record parsing through `parse_records`.
#[cfg(test)]
mod tests {
use super::*;
// Empty input still yields the initial offset 0.
#[test]
fn test_empty_input() {
let offsets = find_record_offsets(b"");
assert_eq!(offsets, vec![0]);
}
// One 16-byte record: the boundary sits right after its fourth newline.
#[test]
fn test_single_record() {
let data = b"@r1\nACGT\n+\nIIII\n";
let offsets = find_record_offsets(data);
assert_eq!(offsets, vec![0, 16]);
}
// Two back-to-back 16-byte records give one boundary per record.
#[test]
fn test_two_records() {
let data = b"@r1\nACGT\n+\nIIII\n@r2\nTTTT\n+\nJJJJ\n";
let offsets = find_record_offsets(data);
assert_eq!(offsets, vec![0, 16, 32]);
}
// A truncated second record gets no closing offset.
#[test]
fn test_incomplete_trailing_record() {
let data = b"@r1\nACGT\n+\nIIII\n@r2\nTT";
let offsets = find_record_offsets(data);
assert_eq!(offsets, vec![0, 16]);
}
// Smallest record used in these tests: one base and one quality byte.
#[test]
fn test_single_base_reads() {
let data = b"@r\nA\n+\nI\n";
assert_eq!(data.len(), 9);
let offsets = find_record_offsets(data);
assert_eq!(offsets, vec![0, 9]);
}
// Field extraction: `@` and line terminators are stripped from the fields.
#[test]
fn test_parse_single_record() {
let data = b"@read1\nACGT\n+\nIIII\n";
let records: Vec<_> = parse_records(data).collect();
assert_eq!(records.len(), 1);
assert_eq!(records[0].name, b"read1");
assert_eq!(records[0].sequence, b"ACGT");
assert_eq!(records[0].quality, b"IIII");
}
// Consecutive records are yielded in input order with their own fields.
#[test]
fn test_parse_multiple_records() {
let data = b"@r1\nACGT\n+\nIIII\n@r2\nTTTT\n+\nJJJJ\n";
let records: Vec<_> = parse_records(data).collect();
assert_eq!(records.len(), 2);
assert_eq!(records[0].name, b"r1");
assert_eq!(records[0].sequence, b"ACGT");
assert_eq!(records[0].quality, b"IIII");
assert_eq!(records[1].name, b"r2");
assert_eq!(records[1].sequence, b"TTTT");
assert_eq!(records[1].quality, b"JJJJ");
}
// The 62-byte header line pushes the sequence line across the 64-byte
// block boundary, so the newline count must carry over between blocks.
#[test]
fn test_record_spanning_block_boundary() {
let name = "X".repeat(60);
let data = format!("@{name}\nACGT\n+\nIIII\n");
let offsets = find_record_offsets(data.as_bytes());
assert_eq!(offsets, vec![0, data.len()]);
let records: Vec<_> = parse_records(data.as_bytes()).collect();
assert_eq!(records.len(), 1);
assert_eq!(records[0].name, name.as_bytes());
assert_eq!(records[0].sequence, b"ACGT");
assert_eq!(records[0].quality, b"IIII");
}
// A single record longer than several 64-byte blocks, most of which
// contain no newline at all.
#[test]
fn test_long_sequence_spanning_multiple_blocks() {
let seq = "A".repeat(200);
let qual = "I".repeat(200);
let data = format!("@r1\n{seq}\n+\n{qual}\n");
let offsets = find_record_offsets(data.as_bytes());
assert_eq!(offsets, vec![0, data.len()]);
let records: Vec<_> = parse_records(data.as_bytes()).collect();
assert_eq!(records.len(), 1);
assert_eq!(records[0].sequence.len(), 200);
}
// Sequence bytes are returned verbatim, with no case folding or filtering.
#[test]
fn test_n_bases_and_mixed_case() {
let data = b"@r1\nAcGtNn\n+\nIIIIII\n";
let records: Vec<_> = parse_records(data).collect();
assert_eq!(records[0].sequence, b"AcGtNn");
}
// Ten short records: several boundaries fall inside the same 64-byte block.
#[test]
fn test_many_small_records() {
let mut data = Vec::new();
for i in 0..10 {
data.extend_from_slice(format!("@r{i}\nA\n+\nI\n").as_bytes());
}
let offsets = find_record_offsets(&data);
// Ten record ends plus the initial offset 0.
assert_eq!(offsets.len(), 11); let records: Vec<_> = parse_records(&data).collect();
assert_eq!(records.len(), 10);
for (i, rec) in records.iter().enumerate() {
assert_eq!(rec.name, format!("r{i}").as_bytes());
}
}
// Input that is exactly one full 64-byte block, leaving no remainder to pad.
#[test]
fn test_exactly_64_bytes() {
let name = "X".repeat(56);
let data = format!("@{name}\nA\n+\nI\n");
assert_eq!(data.len(), 64);
let offsets = find_record_offsets(data.as_bytes());
assert_eq!(offsets, vec![0, 64]);
}
// Only three newlines are present, so no record boundary is produced.
#[test]
fn test_no_complete_records() {
let data = b"@r1\nACGT\n+\n";
let offsets = find_record_offsets(data);
assert_eq!(offsets, vec![0]);
}
}