use crate::bgzf::BgzfReader;
use crate::fai::{FaiEntry, FaiIndex};
use crate::gzi::GziIndex;
use crate::FastX::{FastARecord, FastXRead};
use std::fs::File;
use std::io::{self, Read, Seek};
use std::path::Path;
/// Random-access reader that pairs a BGZF data stream with the FAI index describing it.
///
/// Sequences are located through the FAI index and then read from the BGZF reader.
pub struct IndexedFastXReader<R>
where
    R: Read + Seek,
{
    /// BGZF reader over the underlying data source.
    reader: BgzfReader<R>,
    /// FAI index used to look sequences up by name.
    fai_index: FaiIndex,
}
impl<R: Read + Seek> IndexedFastXReader<R>
{
pub fn new(reader: BgzfReader<R>, fai_index: FaiIndex) -> Self
{
Self { reader, fai_index }
}
}
/// [`IndexedFastXReader`] specialised to a data source that is a local [`File`].
pub type LocalIndexedFastXReader = IndexedFastXReader<File>;
impl IndexedFastXReader<File>
{
    /// Opens a bgzip-compressed file together with its `.fai` and `.gzi` index files.
    ///
    /// Index files are located with `find_index_file`, which probes `<path>.<ext>` first and
    /// then `<path without its last extension>.<ext>`.
    ///
    /// # Errors
    ///
    /// * `NotFound` if no FAI index exists, or if the file is gzip-named and no GZI index exists.
    /// * `Unsupported` if `path` does not have the `gz` extension.
    /// * Any error from loading the indexes, opening the file, or building the BGZF reader.
    pub fn from_path(path: &Path) -> io::Result<Self>
    {
        // Base of the second candidate name probed by `find_index_file`; only used so the
        // error messages list exactly the paths that were tried.
        let stem = path.with_extension("");

        let fai_path = find_index_file(path, "fai").ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::NotFound,
                format!(
                    "FAI index not found for {} (expected {}.fai or {}.fai)",
                    path.display(),
                    path.display(),
                    stem.display(),
                ),
            )
        })?;
        let fai_index = FaiIndex::from_path(&fai_path)?;

        let is_gzip = path.extension().map_or(false, |ext| ext == "gz");

        // Opened before the compression check so that a missing or unreadable data file is
        // reported ahead of the "unsupported" / "missing GZI" errors.
        let file = File::open(path)?;

        if !is_gzip
        {
            return Err(io::Error::new(
                io::ErrorKind::Unsupported,
                "Uncompressed files not yet supported, please use bgzip-compressed files",
            ));
        }

        let gzi_path = find_index_file(path, "gzi").ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::NotFound,
                format!(
                    "GZI index not found for compressed file {} (expected {}.gzi or {}.gzi)",
                    path.display(),
                    path.display(),
                    stem.display(),
                ),
            )
        })?;
        let gzi_index = GziIndex::from_path(&gzi_path)?;
        let reader = BgzfReader::with_index(file, gzi_index)?;

        Ok(Self { reader, fai_index })
    }

    /// Builds a reader over remote data from three URLs: the data file, its FAI index and its
    /// GZI index.
    ///
    /// Both indexes are downloaded in full and parsed in memory before the remote data reader
    /// is created. Although this lives on the `File` specialisation, the returned reader is
    /// backed by `RemoteReader`.
    ///
    /// # Errors
    ///
    /// Returns any error from fetching or parsing either index, from creating the
    /// `RemoteReader`, or from building the BGZF reader.
    #[cfg(feature = "url")]
    pub fn from_url(
        data_url: impl Into<String>,
        fai_url: impl Into<String>,
        gzi_url: impl Into<String>,
    ) -> io::Result<IndexedFastXReader<crate::remote::RemoteReader>>
    {
        use crate::remote::RemoteReader;

        let fai_url: String = fai_url.into();
        let fai_index = parse_fai_from_bytes(&fetch_url(&fai_url)?)?;

        let gzi_url: String = gzi_url.into();
        let gzi_index = parse_gzi_from_bytes(&fetch_url(&gzi_url)?)?;

        let remote_reader = RemoteReader::new(data_url)?;
        let reader = BgzfReader::with_index(remote_reader, gzi_index)?;

        Ok(IndexedFastXReader { reader, fai_index })
    }
}
impl<R: Read + Seek> IndexedFastXReader<R>
{
    /// Builds the error returned when `seq_id` has no entry in the FAI index.
    fn sequence_not_found(seq_id: &str) -> io::Error
    {
        io::Error::new(
            io::ErrorKind::NotFound,
            format!("Sequence '{}' not found in index", seq_id),
        )
    }

    /// Converts an index-derived byte count to `usize`, failing instead of truncating.
    fn to_usize(value: u64) -> io::Result<usize>
    {
        <usize as std::convert::TryFrom<u64>>::try_from(value).map_err(|_| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Value {} does not fit in memory on this platform", value),
            )
        })
    }

    /// Reads the complete record for the sequence named `seq_id`.
    ///
    /// # Errors
    ///
    /// Returns `NotFound` if `seq_id` is not in the FAI index, otherwise any error produced
    /// while locating the header or reading the record.
    pub fn fetch(&mut self, seq_id: &str) -> io::Result<FastARecord>
    {
        // Cloned so the borrow of the index ends before `fetch_entry` borrows `self` mutably.
        let entry = self
            .fai_index
            .get(seq_id)
            .ok_or_else(|| Self::sequence_not_found(seq_id))?
            .clone();
        self.fetch_entry(&entry)
    }

    /// Reads the bases of `seq_id` in the zero-based, half-open range `[start, end)`.
    ///
    /// `end` is clamped to the sequence length recorded in the index. The bytes between the
    /// last base of a line and the first base of the next (`line_width - line_bases` of them)
    /// are skipped, so the result contains sequence data only.
    ///
    /// # Errors
    ///
    /// * `NotFound` if `seq_id` is not in the FAI index.
    /// * `InvalidInput` if `start` is at or beyond the sequence length, or `end < start`.
    /// * `InvalidData` if the index entry has zero bases per line or a line width smaller
    ///   than its bases per line.
    /// * `UnexpectedEof` if the data ends before the requested range has been read.
    /// * Any other I/O error from seeking or reading.
    pub fn fetch_range(&mut self, seq_id: &str, start: u64, end: u64) -> io::Result<Vec<u8>>
    {
        let entry = self
            .fai_index
            .get(seq_id)
            .ok_or_else(|| Self::sequence_not_found(seq_id))?;
        let length = entry.length;
        let line_bases = entry.line_bases;
        let line_width = entry.line_width;

        if start >= length
        {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("Start position {} beyond sequence length {}", start, length),
            ));
        }
        // Without this guard `clamped_end - start` below would underflow.
        if end < start
        {
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                format!("End position {} precedes start position {}", end, start),
            ));
        }
        // A zero line length would make the column arithmetic divide by zero.
        if line_bases == 0
        {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("Index entry for '{}' has zero bases per line", seq_id),
            ));
        }
        let padding = line_width.checked_sub(line_bases).ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "Index entry for '{}' has line width {} smaller than bases per line {}",
                    seq_id, line_width, line_bases
                ),
            )
        })?;
        let padding = Self::to_usize(padding)?;
        let region_len = Self::to_usize(end.min(length) - start)?;

        let start_offset = entry.offset_for_position(start);
        self.reader.seek_uncompressed(start_offset)?;

        // The output is sized up front and filled in place, so no per-read scratch buffer
        // is needed.
        let mut seq_data = vec![0u8; region_len];
        let mut filled = 0usize;
        // Grown lazily the first time a line boundary is crossed, then reused.
        let mut skip_buf: Vec<u8> = Vec::new();
        // Number of bases already consumed on the current line.
        let mut col = start % line_bases;

        while filled < region_len
        {
            if col >= line_bases
            {
                if padding > 0
                {
                    skip_buf.resize(padding, 0);
                    self.reader.read_exact(&mut skip_buf)?;
                }
                col = 0;
            }

            // Never read past the end of the current line or of the requested region.
            let want = std::cmp::min((region_len - filled) as u64, line_bases - col) as usize;
            let n = self.reader.read(&mut seq_data[filled..filled + want])?;
            if n == 0
            {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "Unexpected end of file while reading sequence",
                ));
            }
            filled += n;
            col += n as u64;
        }

        Ok(seq_data)
    }

    /// Locates the header line that precedes `entry.offset` and reads the record from there.
    ///
    /// Up to `MAX_HEADER_SEARCH` bytes before the first base are scanned backwards for a `>`
    /// that starts a line. Requiring the line start keeps a `>` inside the header text from
    /// being mistaken for the beginning of the record.
    ///
    /// # Errors
    ///
    /// Returns `InvalidData` if no header start is found in the scanned window, otherwise any
    /// I/O error from seeking or reading.
    fn fetch_entry(&mut self, entry: &FaiEntry) -> io::Result<FastARecord>
    {
        const MAX_HEADER_SEARCH: u64 = 4096;

        let header_offset = entry.offset.saturating_sub(MAX_HEADER_SEARCH);
        self.reader.seek_uncompressed(header_offset)?;

        // At most MAX_HEADER_SEARCH bytes, so the cast cannot truncate.
        let mut buffer = vec![0u8; (entry.offset - header_offset) as usize];
        self.reader.read_exact(&mut buffer)?;

        // Index 0 is accepted without a preceding newline: it is either the start of the
        // file or the edge of the window, where the previous byte is not available.
        let header_start = (0..buffer.len())
            .rev()
            .find(|&i| buffer[i] == b'>' && (i == 0 || buffer[i - 1] == b'\n'))
            .map(|i| header_offset + i as u64)
            .ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!("Could not find FASTA header for sequence '{}'", entry.name),
                )
            })?;

        self.reader.seek_uncompressed(header_start)?;
        let mut record = FastARecord::default();
        record.read(&mut self.reader)?;
        Ok(record)
    }

    /// Returns the FAI index used by this reader.
    pub fn index(&self) -> &FaiIndex
    {
        &self.fai_index
    }

    /// Returns the GZI index reported by the underlying BGZF reader, if any.
    pub fn gzi_index(&self) -> Option<&GziIndex>
    {
        self.reader.gzi_index()
    }

    /// Reports whether the FAI index has an entry named `seq_id`.
    pub fn contains(&self, seq_id: &str) -> bool
    {
        self.fai_index.contains(seq_id)
    }

    /// Collects the sequence names yielded by the FAI index.
    pub fn sequence_names(&self) -> Vec<&str>
    {
        self.fai_index.sequence_names().collect()
    }
}
/// Downloads `url` with an HTTP GET request and returns the whole response body.
///
/// # Errors
///
/// Both a failed request and a failed body read are reported as
/// `io::ErrorKind::ConnectionRefused`; the message carries the underlying error.
#[cfg(feature = "url")]
#[allow(dead_code)]
fn fetch_url(url: &str) -> io::Result<Vec<u8>>
{
    // Every failure here is surfaced with the same error kind; only the message differs.
    let refused = |message: String| io::Error::new(io::ErrorKind::ConnectionRefused, message);

    let agent = ureq::Agent::new_with_defaults();
    let response = agent
        .get(url)
        .call()
        .map_err(|err| refused(format!("HTTP GET request failed for {}: {}", url, err)))?;

    response
        .into_body()
        .read_to_vec()
        .map_err(|err| refused(format!("Failed to read response body: {}", err)))
}
/// Parses the text of a FAI index held in memory.
///
/// Each non-empty line that does not start with `#` must hold exactly five tab-separated
/// fields: name, length, offset, bases per line and bytes per line. Lines are trimmed before
/// being split. When a name occurs more than once, the last occurrence wins.
///
/// # Errors
///
/// Returns `InvalidData` if the bytes are not UTF-8, a line does not have five fields, a
/// numeric field is not a valid `u64`, or a line's width is smaller than its base count.
#[allow(dead_code)]
fn parse_fai_from_bytes(data: &[u8]) -> io::Result<FaiIndex>
{
    use crate::fai::FaiEntry;
    use std::collections::HashMap;

    fn invalid_data(message: String) -> io::Error
    {
        io::Error::new(io::ErrorKind::InvalidData, message)
    }

    let text = match std::str::from_utf8(data)
    {
        Ok(text) => text,
        Err(_) =>
        {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "FAI data is not valid UTF-8",
            ));
        }
    };

    let mut entries = HashMap::new();
    for (index, raw_line) in text.lines().enumerate()
    {
        // Reported line numbers are 1-based and count skipped lines too.
        let line_no = index + 1;

        let line = raw_line.trim();
        if line.is_empty() || line.starts_with('#')
        {
            continue;
        }

        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() != 5
        {
            return Err(invalid_data(format!(
                "Invalid FAI format at line {}: expected 5 fields, got {}",
                line_no,
                fields.len()
            )));
        }

        // Shared parser for the four numeric columns; `label` names the column in errors.
        let number = |column: usize, label: &str| -> io::Result<u64> {
            fields[column].parse::<u64>().map_err(|_| {
                invalid_data(format!(
                    "Invalid {} at line {}: '{}'",
                    label, line_no, fields[column]
                ))
            })
        };

        let name = fields[0].to_string();
        let length = number(1, "length")?;
        let offset = number(2, "offset")?;
        let line_bases = number(3, "line_bases")?;
        let line_width = number(4, "line_width")?;

        if line_width < line_bases
        {
            return Err(invalid_data(format!(
                "Invalid line_width < line_bases at line {}: {} < {}",
                line_no, line_width, line_bases
            )));
        }

        entries.insert(
            name.clone(),
            FaiEntry {
                name,
                length,
                offset,
                line_bases,
                line_width,
            },
        );
    }

    Ok(FaiIndex { entries })
}
/// Parses a binary GZI index held in memory.
///
/// The data starts with a little-endian `u64` entry count, followed by that many pairs of
/// little-endian `u64` values: compressed offset, then uncompressed offset. Bytes after the
/// declared entries are ignored.
///
/// The entry count is untrusted input (it may come from a remote server), so the size check
/// is done in arithmetic that cannot overflow and nothing is allocated before it passes.
///
/// # Errors
///
/// Returns `InvalidData` if the data is shorter than the header, shorter than the declared
/// entry count requires, or if the entries are not ordered by uncompressed offset.
#[allow(dead_code)]
fn parse_gzi_from_bytes(data: &[u8]) -> io::Result<GziIndex>
{
    const HEADER_LEN: usize = 8;
    const ENTRY_LEN: usize = 16;

    /// Decodes the first eight bytes of `bytes` as a little-endian `u64`.
    fn le_u64(bytes: &[u8]) -> u64
    {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&bytes[..8]);
        u64::from_le_bytes(raw)
    }

    if data.len() < HEADER_LEN
    {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "GZI data too short (less than 8 bytes)",
        ));
    }

    let declared_entries = le_u64(data);

    // u64::MAX * 16 + 8 fits comfortably in u128, so this cannot overflow whatever the
    // header claims.
    let expected_size = HEADER_LEN as u128 + u128::from(declared_entries) * ENTRY_LEN as u128;
    if (data.len() as u128) < expected_size
    {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!("GZI data too short: expected {} bytes, got {}", expected_size, data.len()),
        ));
    }

    // The check above proves the payload fits in `data`, hence in `usize`.
    let num_entries = declared_entries as usize;
    let payload = &data[HEADER_LEN..HEADER_LEN + num_entries * ENTRY_LEN];

    let entries: Vec<(u64, u64)> = payload
        .chunks_exact(ENTRY_LEN)
        .map(|pair| (le_u64(&pair[..8]), le_u64(&pair[8..])))
        .collect();

    let out_of_order = entries.windows(2).any(|pair| pair[1].1 < pair[0].1);
    if out_of_order
    {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "GZI entries not sorted by uncompressed offset",
        ));
    }

    Ok(GziIndex { entries })
}
use std::path::PathBuf;
/// Looks for the index file with extension `ext` that belongs to `path`.
///
/// Two candidates are probed, in this order, and the first one that exists is returned:
///
/// 1. `path` with `.<ext>` appended (`x.fasta.gz` -> `x.fasta.gz.fai`);
/// 2. `path` without its last extension, with `.<ext>` appended (`x.fasta.gz` -> `x.fasta.fai`).
///
/// Returns `None` when neither exists.
fn find_index_file(path: &Path, ext: &str) -> Option<PathBuf>
{
    // Built on the raw OS string rather than `Path::display()`, which replaces non-UTF-8
    // bytes and would therefore point at a different file.
    let with_suffix = |base: &Path| -> PathBuf {
        let mut name = base.as_os_str().to_os_string();
        name.push(".");
        name.push(ext);
        PathBuf::from(name)
    };

    let stem = path.with_extension("");
    [path, stem.as_path()]
        .iter()
        .map(|base| with_suffix(base))
        .find(|candidate| candidate.exists())
}
#[cfg(test)]
mod tests
{
    use super::*;

    /// Scratch directory under the system temp dir, removed on drop.
    ///
    /// Keeps fixtures out of the working directory and cleans up even when an assertion
    /// panics part-way through a test.
    struct ScratchDir(PathBuf);

    impl ScratchDir
    {
        fn new(label: &str) -> Self
        {
            let dir = std::env::temp_dir().join(format!("{}_{}", label, std::process::id()));
            std::fs::create_dir_all(&dir).unwrap();
            ScratchDir(dir)
        }

        fn join(&self, name: &str) -> PathBuf
        {
            self.0.join(name)
        }
    }

    impl Drop for ScratchDir
    {
        fn drop(&mut self)
        {
            // Best effort: a failed cleanup must not mask the test result.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    /// `<path>.fai` is preferred; `<path minus last extension>.fai` is the fallback.
    #[test]
    fn test_find_index_file()
    {
        let scratch = ScratchDir::new("indexed_reader_find_index");
        let fasta_path = scratch.join("test_find.fasta.gz");
        let direct_index = scratch.join("test_find.fasta.gz.fai");
        let stem_index = scratch.join("test_find.fasta.fai");

        std::fs::write(&fasta_path, b">test\nACGT\n").unwrap();

        std::fs::write(&direct_index, b"test\t4\t6\n").unwrap();
        assert_eq!(find_index_file(&fasta_path, "fai"), Some(direct_index.clone()));

        std::fs::remove_file(&direct_index).unwrap();
        std::fs::write(&stem_index, b"test\t4\t6\n").unwrap();
        assert_eq!(find_index_file(&fasta_path, "fai"), Some(stem_index.clone()));
    }

    /// No candidate exists, so nothing is found.
    #[test]
    fn test_index_file_not_found()
    {
        let scratch = ScratchDir::new("indexed_reader_missing_index");
        let path = scratch.join("nonexistent.fasta.gz");
        assert!(find_index_file(&path, "fai").is_none());
    }

    #[test]
    fn test_parse_fai_from_bytes()
    {
        let data = b"chr1\t1000\t0\t80\t81\nchr2\t2000\t1000\t80\t81\n";
        let index = parse_fai_from_bytes(data).unwrap();

        assert_eq!(index.len(), 2);
        assert!(index.contains("chr1"));
        assert!(index.contains("chr2"));

        let chr1 = index.get("chr1").unwrap();
        assert_eq!(chr1.length, 1000);
        assert_eq!(chr1.offset, 0);
    }

    #[test]
    fn test_parse_gzi_from_bytes()
    {
        #[rustfmt::skip]
        let data: Vec<u8> = vec![
            // entry count = 2
            2, 0, 0, 0, 0, 0, 0, 0,
            // entry 0: compressed offset 0, uncompressed offset 0
            0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0,
            // entry 1: compressed offset 100, uncompressed offset 25600 (0x6400)
            100, 0, 0, 0, 0, 0, 0, 0,
            0, 100, 0, 0, 0, 0, 0, 0,
        ];
        let index = parse_gzi_from_bytes(&data).unwrap();

        assert_eq!(index.len(), 2);
        assert_eq!(index.get_compressed_offset(0), Some(0));
        // 5000 is below the second entry's uncompressed offset, so it maps to the first.
        assert_eq!(index.get_compressed_offset(5000), Some(0));
    }
}