use std::io::{self, Read};
use std::path::Path;
use cyanea_core::{CyaneaError, Result};
use cyanea_omics::variant::{Variant, VariantFilter};
use crate::bgzf;
use crate::vcf::VcfStats;
const BCF_MAGIC: [u8; 5] = [b'B', b'C', b'F', 0x02, 0x01];
pub fn parse_bcf(path: impl AsRef<Path>) -> Result<Vec<Variant>> {
let path = path.as_ref();
let file = std::fs::File::open(path).map_err(|e| {
CyaneaError::Io(io::Error::new(
e.kind(),
format!("{}: {}", path.display(), e),
))
})?;
let mut reader = io::BufReader::new(file);
let data = bgzf::decompress_all(&mut reader)?;
if data.len() < 9 {
return Err(CyaneaError::Parse(format!(
"{}: BCF data too short",
path.display()
)));
}
let mut pos = 0;
if data[pos..pos + 5] != BCF_MAGIC {
return Err(CyaneaError::Parse(format!(
"{}: not a valid BCF file (bad magic)",
path.display()
)));
}
pos += 5;
let header_len = read_u32_le(&data, &mut pos)? as usize;
if pos + header_len > data.len() {
return Err(CyaneaError::Parse(format!(
"{}: BCF header truncated",
path.display()
)));
}
let header_text = std::str::from_utf8(&data[pos..pos + header_len])
.map_err(|_| CyaneaError::Parse(format!("{}: invalid UTF-8 in BCF header", path.display())))?;
pos += header_len;
let contigs = parse_header_contigs(header_text);
let filters = parse_header_filters(header_text);
let mut variants = Vec::new();
while pos < data.len() {
if pos + 8 > data.len() {
break;
}
let variant = parse_bcf_record(&data, &mut pos, &contigs, &filters)?;
variants.push(variant);
}
Ok(variants)
}
pub fn bcf_stats(path: impl AsRef<Path>) -> Result<VcfStats> {
let variants = parse_bcf(path)?;
let mut snv_count: u64 = 0;
let mut indel_count: u64 = 0;
let mut pass_count: u64 = 0;
let mut chroms = Vec::new();
let mut seen = std::collections::HashSet::new();
for v in &variants {
if v.is_snv() {
snv_count += 1;
}
if v.is_indel() {
indel_count += 1;
}
if v.filter == VariantFilter::Pass {
pass_count += 1;
}
if seen.insert(v.chrom.clone()) {
chroms.push(v.chrom.clone());
}
}
Ok(VcfStats {
variant_count: variants.len() as u64,
snv_count,
indel_count,
pass_count,
chromosomes: chroms,
})
}
fn read_u32_le(data: &[u8], pos: &mut usize) -> Result<u32> {
if *pos + 4 > data.len() {
return Err(CyaneaError::Parse("unexpected end of BCF data".into()));
}
let val = u32::from_le_bytes([data[*pos], data[*pos + 1], data[*pos + 2], data[*pos + 3]]);
*pos += 4;
Ok(val)
}
fn read_i32_le(data: &[u8], pos: &mut usize) -> Result<i32> {
if *pos + 4 > data.len() {
return Err(CyaneaError::Parse("unexpected end of BCF data".into()));
}
let val = i32::from_le_bytes([data[*pos], data[*pos + 1], data[*pos + 2], data[*pos + 3]]);
*pos += 4;
Ok(val)
}
fn read_f32_le(data: &[u8], pos: &mut usize) -> Result<f32> {
if *pos + 4 > data.len() {
return Err(CyaneaError::Parse("unexpected end of BCF data".into()));
}
let val = f32::from_le_bytes([data[*pos], data[*pos + 1], data[*pos + 2], data[*pos + 3]]);
*pos += 4;
Ok(val)
}
fn parse_header_contigs(header: &str) -> Vec<String> {
let mut contigs = Vec::new();
for line in header.lines() {
if line.starts_with("##contig=<") {
if let Some(rest) = line.strip_prefix("##contig=<") {
let rest = rest.trim_end_matches('>');
for field in rest.split(',') {
if let Some(id) = field.strip_prefix("ID=") {
contigs.push(id.to_string());
break;
}
}
}
}
}
contigs
}
fn parse_header_filters(header: &str) -> Vec<String> {
let mut filters = Vec::new();
for line in header.lines() {
if line.starts_with("##FILTER=<") {
if let Some(rest) = line.strip_prefix("##FILTER=<") {
let rest = rest.trim_end_matches('>');
for field in rest.split(',') {
if let Some(id) = field.strip_prefix("ID=") {
filters.push(id.to_string());
break;
}
}
}
}
}
filters
}
fn read_typed_string(data: &[u8], pos: &mut usize) -> Result<String> {
if *pos >= data.len() {
return Ok(String::new());
}
let type_byte = data[*pos];
*pos += 1;
let type_id = type_byte & 0x0F;
let mut count = ((type_byte >> 4) & 0x0F) as usize;
if count == 15 {
if *pos >= data.len() {
return Ok(String::new());
}
let count_type_byte = data[*pos];
*pos += 1;
let count_type = count_type_byte & 0x0F;
count = match count_type {
1 => {
if *pos >= data.len() {
return Ok(String::new());
}
let v = data[*pos] as usize;
*pos += 1;
v
}
2 => {
if *pos + 2 > data.len() {
return Ok(String::new());
}
let v = u16::from_le_bytes([data[*pos], data[*pos + 1]]) as usize;
*pos += 2;
v
}
3 => {
if *pos + 4 > data.len() {
return Ok(String::new());
}
let v = u32::from_le_bytes([
data[*pos],
data[*pos + 1],
data[*pos + 2],
data[*pos + 3],
]) as usize;
*pos += 4;
v
}
_ => 0,
};
}
if type_id == 7 {
if *pos + count > data.len() {
count = data.len() - *pos;
}
let s = std::str::from_utf8(&data[*pos..*pos + count])
.unwrap_or("")
.trim_end_matches('\0')
.to_string();
*pos += count;
Ok(s)
} else {
Ok(String::new())
}
}
fn read_typed_int_vec(data: &[u8], pos: &mut usize) -> Result<Vec<i32>> {
if *pos >= data.len() {
return Ok(Vec::new());
}
let type_byte = data[*pos];
*pos += 1;
let type_id = type_byte & 0x0F;
let mut count = ((type_byte >> 4) & 0x0F) as usize;
if count == 15 {
if *pos >= data.len() {
return Ok(Vec::new());
}
let count_type_byte = data[*pos];
*pos += 1;
let count_type = count_type_byte & 0x0F;
count = match count_type {
1 => {
if *pos >= data.len() { return Ok(Vec::new()); }
let v = data[*pos] as usize;
*pos += 1;
v
}
2 => {
if *pos + 2 > data.len() { return Ok(Vec::new()); }
let v = u16::from_le_bytes([data[*pos], data[*pos + 1]]) as usize;
*pos += 2;
v
}
3 => {
if *pos + 4 > data.len() { return Ok(Vec::new()); }
let v = u32::from_le_bytes([data[*pos], data[*pos + 1], data[*pos + 2], data[*pos + 3]]) as usize;
*pos += 4;
v
}
_ => 0,
};
}
let mut values = Vec::with_capacity(count);
for _ in 0..count {
match type_id {
1 => {
if *pos >= data.len() { break; }
values.push(data[*pos] as i8 as i32);
*pos += 1;
}
2 => {
if *pos + 2 > data.len() { break; }
let v = i16::from_le_bytes([data[*pos], data[*pos + 1]]);
values.push(v as i32);
*pos += 2;
}
3 => {
if *pos + 4 > data.len() { break; }
let v = i32::from_le_bytes([data[*pos], data[*pos + 1], data[*pos + 2], data[*pos + 3]]);
values.push(v);
*pos += 4;
}
_ => break,
}
}
Ok(values)
}
fn parse_bcf_record(
data: &[u8],
pos: &mut usize,
contigs: &[String],
filters: &[String],
) -> Result<Variant> {
let l_shared = read_u32_le(data, pos)? as usize;
let l_indiv = read_u32_le(data, pos)? as usize;
let record_start = *pos;
let chrom_idx = read_i32_le(data, pos)?;
let position = read_i32_le(data, pos)?; let _rlen = read_i32_le(data, pos)?;
let qual = read_f32_le(data, pos)?;
if *pos + 4 > data.len() {
return Err(CyaneaError::Parse("BCF record truncated".into()));
}
let n_info_allele = read_u32_le(data, pos)?;
let _n_info = (n_info_allele & 0xFFFF) as u16;
let n_allele = (n_info_allele >> 16) as u16;
let _n_fmt_sample = read_u32_le(data, pos)?;
let id_str = read_typed_string(data, pos)?;
let id = if id_str.is_empty() || id_str == "." {
None
} else {
Some(id_str)
};
let mut alleles = Vec::with_capacity(n_allele as usize);
for _ in 0..n_allele {
let allele = read_typed_string(data, pos)?;
alleles.push(allele);
}
let filter_indices = read_typed_int_vec(data, pos)?;
let consumed = *pos - record_start;
let total = l_shared + l_indiv;
if consumed < total {
*pos = record_start + total;
}
let chrom = if chrom_idx >= 0 && (chrom_idx as usize) < contigs.len() {
contigs[chrom_idx as usize].clone()
} else {
format!("chr{}", chrom_idx)
};
let ref_allele = alleles
.first()
.map(|a| a.as_bytes().to_vec())
.unwrap_or_default();
let alt_alleles: Vec<Vec<u8>> = alleles
.iter()
.skip(1)
.filter(|a| !a.is_empty() && *a != ".")
.map(|a| a.as_bytes().to_vec())
.collect();
let quality = if qual.is_nan() || qual == f32::from_bits(0x7F80_0001) {
None
} else {
Some(qual as f64)
};
let filter = if filter_indices.is_empty() {
VariantFilter::Missing
} else if filter_indices == [0] {
VariantFilter::Pass
} else {
let filter_names: Vec<String> = filter_indices
.iter()
.map(|&idx| {
if idx >= 0 && (idx as usize) < filters.len() {
filters[idx as usize].clone()
} else {
format!("FILTER{}", idx)
}
})
.collect();
if filter_names.iter().any(|f| f == "PASS") {
VariantFilter::Pass
} else {
VariantFilter::Fail(filter_names)
}
};
let vcf_position = if position < 0 { 0u64 } else { (position + 1) as u64 };
Ok(Variant {
chrom,
position: vcf_position,
id,
ref_allele,
alt_alleles,
quality,
filter,
})
}
#[cfg(test)]
mod test_helpers {
use std::io::Write;
pub use crate::bgzf::test_helpers::{bgzf_compress, bgzf_eof_block};
pub fn encode_typed_string(s: &str) -> Vec<u8> {
let bytes = s.as_bytes();
let mut out = Vec::new();
let len = bytes.len();
if len < 15 {
out.push(((len as u8) << 4) | 7); } else {
out.push((15 << 4) | 7); if len < 128 {
out.push((1 << 4) | 1); out.push(len as u8);
} else {
out.push((1 << 4) | 2); out.extend_from_slice(&(len as u16).to_le_bytes());
}
}
out.extend_from_slice(bytes);
out
}
pub fn encode_typed_int8_vec(values: &[i8]) -> Vec<u8> {
let mut out = Vec::new();
let len = values.len();
if len < 15 {
out.push(((len as u8) << 4) | 1); } else {
out.push((15 << 4) | 1);
out.push((1 << 4) | 1);
out.push(len as u8);
}
for &v in values {
out.push(v as u8);
}
out
}
pub fn build_bcf_record(
chrom_idx: i32,
pos: i32, qual: f32,
id: &str,
alleles: &[&str],
filter_indices: &[i8],
) -> Vec<u8> {
let mut shared = Vec::new();
shared.extend_from_slice(&chrom_idx.to_le_bytes());
shared.extend_from_slice(&pos.to_le_bytes());
let rlen = alleles.first().map(|a| a.len() as i32).unwrap_or(0);
shared.extend_from_slice(&rlen.to_le_bytes());
shared.extend_from_slice(&qual.to_le_bytes());
let n_allele = alleles.len() as u32;
let n_info_allele = (n_allele << 16) | 0;
shared.extend_from_slice(&n_info_allele.to_le_bytes());
shared.extend_from_slice(&0u32.to_le_bytes());
if id.is_empty() || id == "." {
shared.push((0 << 4) | 7); } else {
shared.extend_from_slice(&encode_typed_string(id));
}
for allele in alleles {
shared.extend_from_slice(&encode_typed_string(allele));
}
shared.extend_from_slice(&encode_typed_int8_vec(filter_indices));
let l_shared = shared.len() as u32;
let l_indiv = 0u32;
let mut record = Vec::new();
record.extend_from_slice(&l_shared.to_le_bytes());
record.extend_from_slice(&l_indiv.to_le_bytes());
record.extend_from_slice(&shared);
record
}
pub fn build_bcf_content(
contigs: &[&str],
filter_defs: &[&str],
records_data: &[Vec<u8>],
) -> Vec<u8> {
let mut data = Vec::new();
data.extend_from_slice(b"BCF\x02\x01");
let mut header = String::new();
header.push_str("##fileformat=VCFv4.3\n");
for (i, contig) in contigs.iter().enumerate() {
header.push_str(&format!("##contig=<ID={},length={}>\n", contig, 1000 * (i + 1)));
}
for filter_name in filter_defs {
header.push_str(&format!("##FILTER=<ID={},Description=\"{}\">\n", filter_name, filter_name));
}
header.push_str("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
let header_bytes = header.as_bytes();
let header_len = header_bytes.len() + 1; data.extend_from_slice(&(header_len as u32).to_le_bytes());
data.extend_from_slice(header_bytes);
data.push(0);
for rec in records_data {
data.extend_from_slice(rec);
}
data
}
pub fn write_test_bcf(
contigs: &[&str],
filter_defs: &[&str],
records_data: &[Vec<u8>],
) -> tempfile::NamedTempFile {
let content = build_bcf_content(contigs, filter_defs, records_data);
let compressed = bgzf_compress(&content);
let eof = bgzf_eof_block();
let mut file = tempfile::NamedTempFile::with_suffix(".bcf").unwrap();
file.write_all(&compressed).unwrap();
file.write_all(&eof).unwrap();
file.flush().unwrap();
file
}
}
#[cfg(test)]
mod tests {
use super::test_helpers::*;
use super::*;
#[test]
fn bcf_parse_simple() {
let records = vec![
build_bcf_record(0, 99, 30.0, "rs123", &["A", "G"], &[0]), build_bcf_record(0, 199, f32::NAN, ".", &["AC", "A"], &[]), build_bcf_record(1, 299, 50.5, ".", &["T", "TA", "TG"], &[1]), ];
let file = write_test_bcf(
&["chr1", "chr2"],
&["PASS", "LowQual"],
&records,
);
let variants = parse_bcf(file.path()).unwrap();
assert_eq!(variants.len(), 3);
assert_eq!(variants[0].chrom, "chr1");
assert_eq!(variants[0].position, 100); assert_eq!(variants[0].id, Some("rs123".to_string()));
assert_eq!(variants[0].ref_allele, b"A");
assert_eq!(variants[0].alt_alleles, vec![b"G".to_vec()]);
assert!((variants[0].quality.unwrap() - 30.0).abs() < 0.1);
assert_eq!(variants[0].filter, VariantFilter::Pass);
assert_eq!(variants[1].position, 200);
assert_eq!(variants[1].id, None);
assert_eq!(variants[1].quality, None);
assert_eq!(variants[2].chrom, "chr2");
assert_eq!(variants[2].alt_alleles.len(), 2);
}
#[test]
fn bcf_matches_vcf() {
let records = vec![
build_bcf_record(0, 99, 30.0, ".", &["A", "G"], &[0]),
];
let file = write_test_bcf(&["chr1"], &["PASS"], &records);
let variants = parse_bcf(file.path()).unwrap();
assert_eq!(variants[0].chrom, "chr1");
assert_eq!(variants[0].position, 100);
assert_eq!(variants[0].ref_allele, b"A");
assert_eq!(variants[0].alt_alleles, vec![b"G".to_vec()]);
assert_eq!(variants[0].filter, VariantFilter::Pass);
}
#[test]
fn bcf_stats_computed() {
let records = vec![
build_bcf_record(0, 99, 30.0, ".", &["A", "G"], &[0]),
build_bcf_record(0, 199, 40.0, ".", &["AC", "A"], &[0]),
build_bcf_record(1, 299, 50.0, ".", &["T", "C"], &[1]),
];
let file = write_test_bcf(&["chr1", "chr2"], &["PASS", "LowQual"], &records);
let stats = bcf_stats(file.path()).unwrap();
assert_eq!(stats.variant_count, 3);
assert_eq!(stats.snv_count, 2);
assert_eq!(stats.indel_count, 1);
assert_eq!(stats.pass_count, 2);
assert_eq!(stats.chromosomes, vec!["chr1", "chr2"]);
}
#[test]
fn bcf_invalid_magic() {
let mut bad_data = vec![0u8; 100];
bad_data[0..5].copy_from_slice(b"XXXXX");
let compressed = bgzf_compress(&bad_data);
let eof = bgzf_eof_block();
let mut file = tempfile::NamedTempFile::with_suffix(".bcf").unwrap();
use std::io::Write;
file.write_all(&compressed).unwrap();
file.write_all(&eof).unwrap();
file.flush().unwrap();
let result = parse_bcf(file.path());
assert!(result.is_err());
let err_msg = format!("{}", result.unwrap_err());
assert!(err_msg.contains("not a valid BCF"));
}
#[test]
fn bcf_multiallelic() {
let records = vec![
build_bcf_record(0, 99, 30.0, ".", &["A", "G", "T", "C"], &[0]),
];
let file = write_test_bcf(&["chr1"], &["PASS"], &records);
let variants = parse_bcf(file.path()).unwrap();
assert_eq!(variants[0].alt_alleles.len(), 3);
assert_eq!(variants[0].alt_alleles[0], b"G");
assert_eq!(variants[0].alt_alleles[1], b"T");
assert_eq!(variants[0].alt_alleles[2], b"C");
}
}