use crate::bam_io::create_raw_bam_reader;
use crate::commands::common::parse_bool;
use crate::logging::OperationTimer;
use crate::validation::validate_file_exists;
use anyhow::Result;
use clap::Parser;
use fgumi_raw_bam::{
RawRecord, extract_sequence_into, quality_scores_slice, read_name as raw_read_name,
};
use log::info;
use std::io::{BufWriter, Write, stdout};
use std::path::PathBuf;
use crate::commands::command::Command;
static QUAL_TO_ASCII: [u8; 256] = {
let mut table = [0u8; 256];
let mut i = 0;
while i < 256 {
let val = (i as u8).saturating_add(33);
table[i] = if val > 126 { 126 } else { val };
i += 1;
}
table
};
static COMPLEMENT: [u8; 256] = {
let mut table = [b'N'; 256];
table[b'A' as usize] = b'T';
table[b'a' as usize] = b'T';
table[b'T' as usize] = b'A';
table[b't' as usize] = b'A';
table[b'C' as usize] = b'G';
table[b'c' as usize] = b'G';
table[b'G' as usize] = b'C';
table[b'g' as usize] = b'C';
table[b'N' as usize] = b'N';
table[b'n' as usize] = b'N';
table
};
#[derive(Debug, Parser)]
#[command(
name = "fastq",
about = "\x1b[38;5;72m[ALIGNMENT]\x1b[0m \x1b[36mConvert BAM to FASTQ format\x1b[0m",
long_about = r#"
Convert a BAM file to interleaved FASTQ format.
Reads BAM records and outputs FASTQ to stdout for piping to aligners.
Input should be queryname-sorted or template-coordinate sorted.
EXAMPLES:
# Pipe to bwa mem for alignment
fgumi fastq -i unmapped.bam | bwa mem -t 16 -p -K 150000000 -Y ref.fa -
# With multi-threaded BAM decompression
fgumi fastq -i unmapped.bam -@ 4 | bwa mem -t 16 -p ref.fa -
# Exclude secondary and supplementary alignments (default)
fgumi fastq -i aligned.bam -F 0x900 | bwa mem ...
"#
)]
pub struct Fastq {
#[arg(short = 'i', long = "input")]
pub input: PathBuf,
#[arg(short = 'n', long = "no-read-suffix", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub no_suffix: bool,
#[arg(short = 'F', long = "exclude-flags", default_value_t = 0x900, value_parser = parse_flags)]
pub exclude_flags: u16,
#[arg(short = 'f', long = "require-flags", default_value_t = 0, value_parser = parse_flags)]
pub require_flags: u16,
#[arg(short = '@', short_alias = 't', long = "threads", default_value = "1")]
pub threads: usize,
#[arg(short = 'K', long = "bwa-chunk-size", default_value = "150000000")]
pub bwa_chunk_size: u64,
}
fn parse_flags(s: &str) -> Result<u16, String> {
if s.starts_with("0x") || s.starts_with("0X") {
u16::from_str_radix(&s[2..], 16).map_err(|e| e.to_string())
} else {
s.parse().map_err(|e: std::num::ParseIntError| e.to_string())
}
}
impl Command for Fastq {
fn execute(&self, _command_line: &str) -> Result<()> {
validate_file_exists(&self.input, "Input BAM")?;
let timer = OperationTimer::new("Converting BAM to FASTQ");
info!("Input: {}", self.input.display());
info!("Threads: {}", self.threads);
info!("Exclude flags: 0x{:X}", self.exclude_flags);
info!("Require flags: 0x{:X}", self.require_flags);
info!("Read name suffix: {}", if self.no_suffix { "disabled" } else { "enabled" });
info!("BWA chunk size: {} bases", self.bwa_chunk_size);
let (mut reader, _header) = create_raw_bam_reader(&self.input, self.threads)?;
let mut writer = BufWriter::with_capacity(64 * 1024 * 1024, stdout().lock());
let mut total_records: u64 = 0;
let mut written_records: u64 = 0;
let mut bases_this_batch: u64 = 0;
let mut records_this_batch: usize = 0;
let mut seq_buf: Vec<u8> = Vec::with_capacity(512);
let mut qual_buf: Vec<u8> = Vec::with_capacity(512);
let mut record = RawRecord::new();
loop {
let n = reader.read_record(&mut record)?;
if n == 0 {
break; }
total_records += 1;
let flags = record.flags();
if (flags & self.exclude_flags) != 0 {
continue;
}
if (flags & self.require_flags) != self.require_flags {
continue;
}
let seq_len = record.l_seq() as usize;
write_fastq_record(
&mut writer,
&record,
flags,
self.no_suffix,
&mut seq_buf,
&mut qual_buf,
)?;
written_records += 1;
bases_this_batch += seq_len as u64;
records_this_batch += 1;
if bases_this_batch >= self.bwa_chunk_size && records_this_batch.is_multiple_of(2) {
writer.flush()?;
bases_this_batch = 0;
records_this_batch = 0;
}
}
writer.flush()?;
info!("Read {total_records} records, wrote {written_records} FASTQ records");
timer.log_completion(written_records);
Ok(())
}
}
#[inline]
fn write_fastq_record<W: Write>(
writer: &mut W,
record: &RawRecord,
flags: u16,
no_suffix: bool,
seq_buf: &mut Vec<u8>,
qual_buf: &mut Vec<u8>,
) -> Result<()> {
use fgumi_raw_bam::flags as flag_bits;
let name = raw_read_name(record);
let is_first = (flags & flag_bits::FIRST_SEGMENT) != 0;
let is_last = (flags & flag_bits::LAST_SEGMENT) != 0;
let suffix: &[u8] = if no_suffix {
b""
} else if is_first && !is_last {
b"/1"
} else if is_last && !is_first {
b"/2"
} else {
b"" };
extract_sequence_into(record, seq_buf);
qual_buf.clear();
qual_buf.extend(quality_scores_slice(record).iter().map(|&s| QUAL_TO_ASCII[s as usize]));
if (flags & flag_bits::REVERSE) != 0 {
seq_buf.reverse();
for base in seq_buf.iter_mut() {
*base = COMPLEMENT[*base as usize];
}
qual_buf.reverse();
}
writer.write_all(b"@")?;
writer.write_all(name)?;
writer.write_all(suffix)?;
writer.write_all(b"\n")?;
writer.write_all(seq_buf)?;
writer.write_all(b"\n+\n")?;
writer.write_all(qual_buf)?;
writer.write_all(b"\n")?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
fn write_reverse_complement_bytes<W: Write>(writer: &mut W, bases: &[u8]) -> Result<()> {
for &base in bases.iter().rev() {
let comp = match base {
b'A' | b'a' => b'T',
b'T' | b't' => b'A',
b'C' | b'c' => b'G',
b'G' | b'g' => b'C',
b'N' | b'n' => b'N',
_ => b'N',
};
writer.write_all(&[comp])?;
}
Ok(())
}
fn write_quality_bytes<W: Write>(writer: &mut W, quals: &[u8]) -> Result<()> {
for &score in quals {
let ascii = score.saturating_add(33).min(126);
writer.write_all(&[ascii])?;
}
Ok(())
}
fn write_reversed_quality_bytes<W: Write>(writer: &mut W, quals: &[u8]) -> Result<()> {
for &score in quals.iter().rev() {
let ascii = score.saturating_add(33).min(126);
writer.write_all(&[ascii])?;
}
Ok(())
}
#[test]
fn test_parse_flags_decimal() {
assert_eq!(parse_flags("2304").expect("parse decimal '2304' should succeed"), 2304);
assert_eq!(parse_flags("0").expect("parse decimal '0' should succeed"), 0);
assert_eq!(parse_flags("65535").expect("parse decimal '65535' should succeed"), 65535);
}
#[test]
fn test_parse_flags_hex() {
assert_eq!(parse_flags("0x900").expect("parse hex '0x900' should succeed"), 0x900);
assert_eq!(parse_flags("0X900").expect("parse hex '0X900' should succeed"), 0x900);
assert_eq!(parse_flags("0xff").expect("parse hex '0xff' should succeed"), 0xff);
assert_eq!(parse_flags("0xFFFF").expect("parse hex '0xFFFF' should succeed"), 0xFFFF);
}
#[test]
fn test_parse_flags_invalid() {
assert!(parse_flags("invalid").is_err());
assert!(parse_flags("0xGGGG").is_err());
assert!(parse_flags("-1").is_err());
}
#[test]
fn test_qual_to_ascii_lookup_table() {
assert_eq!(QUAL_TO_ASCII[0], 33);
assert_eq!(QUAL_TO_ASCII[30], 63);
assert_eq!(QUAL_TO_ASCII[40], 73);
assert_eq!(QUAL_TO_ASCII[93], 126);
assert_eq!(QUAL_TO_ASCII[94], 126);
assert_eq!(QUAL_TO_ASCII[255], 126);
}
#[test]
fn test_complement_lookup_table() {
assert_eq!(COMPLEMENT[b'A' as usize], b'T');
assert_eq!(COMPLEMENT[b'T' as usize], b'A');
assert_eq!(COMPLEMENT[b'C' as usize], b'G');
assert_eq!(COMPLEMENT[b'G' as usize], b'C');
assert_eq!(COMPLEMENT[b'N' as usize], b'N');
assert_eq!(COMPLEMENT[b'a' as usize], b'T');
assert_eq!(COMPLEMENT[b't' as usize], b'A');
assert_eq!(COMPLEMENT[b'c' as usize], b'G');
assert_eq!(COMPLEMENT[b'g' as usize], b'C');
assert_eq!(COMPLEMENT[b'n' as usize], b'N');
assert_eq!(COMPLEMENT[b'X' as usize], b'N');
assert_eq!(COMPLEMENT[0], b'N');
}
#[test]
fn test_write_reverse_complement_bytes() {
let mut output = Vec::new();
write_reverse_complement_bytes(&mut output, b"ACGT")
.expect("write_reverse_complement_bytes should succeed");
assert_eq!(output, b"ACGT");
output.clear();
write_reverse_complement_bytes(&mut output, b"AAAA")
.expect("write_reverse_complement_bytes should succeed");
assert_eq!(output, b"TTTT");
output.clear();
write_reverse_complement_bytes(&mut output, b"ATCG")
.expect("write_reverse_complement_bytes should succeed");
assert_eq!(output, b"CGAT");
output.clear();
write_reverse_complement_bytes(&mut output, b"ANCG")
.expect("write_reverse_complement_bytes should succeed");
assert_eq!(output, b"CGNT");
}
#[test]
fn test_write_quality_bytes() {
let mut output = Vec::new();
write_quality_bytes(&mut output, &[0, 30, 40]).expect("write_quality_bytes should succeed");
assert_eq!(output, vec![33, 63, 73]);
}
#[test]
fn test_write_reversed_quality_bytes() {
let mut output = Vec::new();
write_reversed_quality_bytes(&mut output, &[0, 30, 40])
.expect("write_reversed_quality_bytes should succeed");
assert_eq!(output, vec![73, 63, 33]);
}
#[test]
fn test_quality_encoding_edge_cases() {
let mut output = Vec::new();
write_quality_bytes(&mut output, &[93]).expect("write_quality_bytes should succeed");
assert_eq!(output, vec![126]);
output.clear();
write_quality_bytes(&mut output, &[94, 100, 255])
.expect("write_quality_bytes should succeed");
assert_eq!(output, vec![126, 126, 126]);
}
}