use crate::core::tables::LOOKUP_TABLES;
use crate::core::{SeqReader, SeqRecord};
use crate::utils::region::{parse_regions, Region, RegionList};
use anyhow::{Context, Result};
use clap::Args;
use std::io::Write;
#[derive(Args, Debug)]
pub struct SubseqArgs {
#[arg(value_name = "in.fa")]
pub input: String,
#[arg(value_name = "in.bed|name.list")]
pub regions_file: String,
#[arg(short = 't', long)]
pub tab_delimited: bool,
#[arg(short = 's', long)]
pub strand_aware: bool,
#[arg(short = 'l', long, value_name = "INT", default_value = "0")]
pub line_width: usize,
}
pub fn run(args: &SubseqArgs) -> Result<()> {
let regions = parse_regions(&args.regions_file)
.with_context(|| format!("无法读取区域文件: {}", args.regions_file))?;
let mut reader = if args.input == "-" {
SeqReader::from_stdin()
} else {
SeqReader::from_path(&args.input)
.with_context(|| format!("无法打开输入文件: {}", args.input))?
};
let stdout = std::io::stdout();
let mut writer = stdout.lock();
let mut record = SeqRecord::new(Vec::new(), Vec::new());
while reader.read_next(&mut record)? {
if let Some(region_list) = regions.get(&record.name) {
extract_and_output(&mut writer, &record, region_list, args)?;
}
}
writer.flush()?;
Ok(())
}
fn extract_and_output(
writer: &mut impl Write,
record: &SeqRecord,
region_list: &RegionList,
args: &SubseqArgs,
) -> Result<()> {
for region in region_list.iter() {
let beg = region.start;
let mut end = region.end;
if beg >= record.seq.len() {
eprintln!(
"[subseq] {}: {} >= {}",
String::from_utf8_lossy(&record.name),
beg,
record.seq.len()
);
continue;
}
if end > record.seq.len() {
end = record.seq.len();
}
if args.tab_delimited {
output_tab_format(writer, record, beg, end, args)?;
} else {
output_fasta_format(writer, record, region, beg, end, args)?;
}
}
Ok(())
}
fn output_tab_format(
writer: &mut impl Write,
record: &SeqRecord,
beg: usize,
end: usize,
_args: &SubseqArgs,
) -> Result<()> {
write!(
writer,
"{}\t{}\t",
String::from_utf8_lossy(&record.name),
beg + 1
)?;
writer.write_all(&record.seq[beg..end])?;
writeln!(writer)?;
Ok(())
}
fn output_fasta_format(
writer: &mut impl Write,
record: &SeqRecord,
region: Region,
beg: usize,
end: usize,
args: &SubseqArgs,
) -> Result<()> {
let is_fastq = record.qual.is_some();
if !args.tab_delimited {
let prefix = if is_fastq { b'@' } else { b'>' };
write!(writer, "{}", prefix as char)?;
writer.write_all(&record.name)?;
let is_full_seq = beg == 0 && end >= record.seq.len();
if !is_full_seq {
if end >= record.seq.len() {
if beg > 0 {
write!(writer, ":{}", beg + 1)?;
}
} else {
write!(writer, ":{}-{}", beg + 1, end)?;
}
}
if let Some(ref comment) = record.comment {
write!(writer, " ")?;
writer.write_all(comment)?;
}
writeln!(writer)?;
}
let subseq = &record.seq[beg..end];
if args.strand_aware && region.strand.is_some() {
let strand = region.strand.unwrap();
if strand == b'-' {
let rev_comp = reverse_complement(subseq);
output_sequence_lines(writer, &rev_comp, args.line_width)?;
} else {
output_sequence_lines(writer, subseq, args.line_width)?;
}
} else {
output_sequence_lines(writer, subseq, args.line_width)?;
}
if is_fastq && !args.tab_delimited {
if let Some(ref qual) = record.qual {
writeln!(writer, "+")?;
let subqual = &qual[beg..end];
output_sequence_lines(writer, subqual, args.line_width)?;
}
}
Ok(())
}
fn output_sequence_lines(writer: &mut impl Write, seq: &[u8], line_width: usize) -> Result<()> {
if line_width == 0 {
writer.write_all(seq)?;
writeln!(writer)?;
} else {
for (i, &base) in seq.iter().enumerate() {
if i > 0 && i % line_width == 0 {
writeln!(writer)?;
}
write!(writer, "{}", base as char)?;
}
writeln!(writer)?;
}
Ok(())
}
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
seq.iter()
.rev()
.map(|&b| LOOKUP_TABLES.comp[b as usize])
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_reverse_complement() {
assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
assert_eq!(reverse_complement(b"GCTA"), b"TAGC");
}
#[test]
fn test_output_sequence_lines() -> Result<()> {
let mut buffer = Vec::new();
output_sequence_lines(&mut buffer, b"ACGTACGT", 0)?;
assert_eq!(buffer, b"ACGTACGT\n");
buffer.clear();
output_sequence_lines(&mut buffer, b"ACGTACGT", 4)?;
assert_eq!(buffer, b"ACGT\nACGT\n");
Ok(())
}
}