use super::*;
use crate::{Subgraph, SubgraphQuery, HaplotypeOutput};
use crate::GAFBaseParams;
use crate::internal;
fn gaf_base_subgraph_queries() -> Vec<(SubgraphQuery, String)> {
vec![
(SubgraphQuery::nodes(vec![]).with_context(100).with_snarls(false).with_output(HaplotypeOutput::All), String::from("empty")),
(SubgraphQuery::nodes(vec![1000]).with_context(100).with_snarls(false).with_output(HaplotypeOutput::All), String::from("single component")),
(SubgraphQuery::nodes(vec![500, 1500]).with_context(100).with_snarls(false).with_output(HaplotypeOutput::All), String::from("two components")),
]
}
fn validate_read_set(read_set: &ReadSet, subgraph: &Subgraph, all_reads: &[Alignment], name: &str, output: AlignmentOutput, params: &GAFBaseParams) {
let mut expected_reads = Vec::new();
let mut unclipped = 0;
for aln in all_reads {
let target_path = aln.target_path().unwrap();
if target_path.is_empty() {
continue;
}
let mut take = false;
for handle in target_path {
if subgraph.has_handle(*handle) {
take = true;
} else if output == AlignmentOutput::Contained {
take = false;
break;
}
}
if take {
let mut aln = aln.clone();
if !params.store_quality_strings {
aln.base_quality.clear();
}
if !params.store_optional_fields {
aln.optional.clear();
}
if output == AlignmentOutput::Clipped {
let sequence_len = Arc::new(|handle| {
let record = read_set.nodes.get(&handle)?;
Some(record.sequence().len())
});
let clipped = aln.clip(subgraph, sequence_len);
if let Err(err) = clipped {
panic!("Failed to clip an overlapping read for {}: {}", name, err);
}
let clipped = clipped.unwrap();
for fragment in clipped.into_iter() {
expected_reads.push(fragment);
}
} else {
expected_reads.push(aln);
}
unclipped += 1;
}
}
assert_eq!(read_set.len(), expected_reads.len(), "Wrong number of {} reads in read set for {}", output, name);
assert_eq!(read_set.unclipped(), unclipped, "Wrong number of unclipped {} reads in read set for {}", output, name);
for (i, (aln, truth)) in read_set.iter().zip(expected_reads.iter()).enumerate() {
assert_eq!(aln, truth, "Wrong {} read {} for {}", output, i, name);
}
let mut expected_gaf = Vec::new();
for aln in expected_reads {
let mut target_sequence = Vec::new();
let target_path = aln.target_path().unwrap();
for handle in target_path {
let record = read_set.nodes.get(handle);
assert!(record.is_some(), "Missing GBZ record for handle {} in {}", handle, name);
let record = record.unwrap();
target_sequence.extend_from_slice(record.sequence());
}
let gaf_line = aln.to_gaf(&target_sequence);
expected_gaf.extend_from_slice(&gaf_line);
expected_gaf.push(b'\n');
}
let mut gaf_output = Vec::new();
let result = read_set.to_gaf(&mut gaf_output);
assert!(result.is_ok(), "Failed to output {} GAF for {}: {}", output, name, result.unwrap_err());
assert_eq!(gaf_output.len(), expected_gaf.len(), "Wrong {} GAF output length for {}", output, name);
assert_eq!(gaf_output, expected_gaf, "Wrong {} GAF output for {}", output, name);
}
#[test]
fn read_set_default() {
let read_set = ReadSet::default();
assert_eq!(read_set.len(), 0, "Default ReadSet has non-zero length");
assert_eq!(read_set.unclipped(), 0, "Default ReadSet has non-zero unclipped count");
assert_eq!(read_set.blocks(), 0, "Default ReadSet has non-zero block count");
assert_eq!(read_set.clusters(), 0, "Default ReadSet has non-zero cluster count");
assert!(read_set.iter().next().is_none(), "Default ReadSet iterator is not empty");
}
fn test_read_set_gbz(gbwt_part: &'static str, params: &GAFBaseParams) {
let graph = internal::load_gaf_base_gbz();
let graph_ref = if params.reference_free {
GraphReference::Gbz(&graph)
} else {
GraphReference::None
};
let gaf_base_file = internal::create_gaf_base_with_params("micb-kir3dl1_HG003.gaf", gbwt_part, graph_ref, params);
let gaf_base = internal::open_gaf_base(&gaf_base_file);
let all_reads = internal::load_gaf_base_reads(false);
let queries = gaf_base_subgraph_queries();
for (query, name) in queries {
let mut subgraph = Subgraph::new();
let result = subgraph.from_gbz(&graph, None, None, &query);
assert!(result.is_ok(), "Failed to extract subgraph for {}: {}", name, result.unwrap_err());
for output in [AlignmentOutput::Overlapping, AlignmentOutput::Clipped, AlignmentOutput::Contained] {
let graph_ref = if params.reference_free {
GraphReference::None
} else {
GraphReference::Gbz(&graph)
};
let read_set = ReadSet::new(graph_ref, &subgraph, &gaf_base, output);
assert!(read_set.is_ok(), "Failed to extract {} reads for {}: {}", output, name, read_set.unwrap_err());
let read_set = read_set.unwrap();
validate_read_set(&read_set, &subgraph, &all_reads, &name, output, params);
}
}
drop(gaf_base);
let _ = std::fs::remove_file(&gaf_base_file);
}
#[test]
fn read_set_gbz_unidirectional() {
test_read_set_gbz("micb-kir3dl1_HG003.gbwt", &GAFBaseParams::default());
}
#[test]
fn read_set_gbz_bidirectional() {
test_read_set_gbz("bidirectional.gbwt", &GAFBaseParams::default());
}
#[test]
fn read_set_gbz_no_quality() {
let mut params = GAFBaseParams::default();
params.store_quality_strings = false;
test_read_set_gbz("micb-kir3dl1_HG003.gbwt", ¶ms);
}
#[test]
fn read_set_gbz_no_optional() {
let mut params = GAFBaseParams::default();
params.store_optional_fields = false;
test_read_set_gbz("micb-kir3dl1_HG003.gbwt", ¶ms);
}
#[test]
fn read_set_gbz_ref_free() {
let mut params = GAFBaseParams::default();
params.reference_free = true;
test_read_set_gbz("micb-kir3dl1_HG003.gbwt", ¶ms);
}
fn test_read_set_db(gbwt_part: &'static str, params: &GAFBaseParams) {
let (gbz_base, gbz_base_file) = internal::create_gaf_base_db();
let mut graph = internal::create_graph_interface(&gbz_base);
let graph_ref = if params.reference_free {
GraphReference::Db(&mut graph)
} else {
GraphReference::None
};
let gaf_base_file = internal::create_gaf_base_with_params("micb-kir3dl1_HG003.gaf", gbwt_part, graph_ref, params);
let gaf_base = internal::open_gaf_base(&gaf_base_file);
let all_reads = internal::load_gaf_base_reads(false);
let queries = gaf_base_subgraph_queries();
for (query, name) in queries {
let mut subgraph = Subgraph::new();
let result = subgraph.from_db(&mut graph, &query);
assert!(result.is_ok(), "Failed to extract subgraph for {}: {}", name, result.unwrap_err());
for output in [AlignmentOutput::Overlapping, AlignmentOutput::Clipped, AlignmentOutput::Contained] {
let graph_ref = if params.reference_free {
GraphReference::None
} else {
GraphReference::Db(&mut graph)
};
let read_set = ReadSet::new(graph_ref, &subgraph, &gaf_base, output);
assert!(read_set.is_ok(), "Failed to extract {} reads for {}: {}", output, name, read_set.unwrap_err());
let read_set = read_set.unwrap();
validate_read_set(&read_set, &subgraph, &all_reads, &name, output, params);
}
}
drop(graph);
drop(gbz_base);
let _ = std::fs::remove_file(&gbz_base_file);
drop(gaf_base);
let _ = std::fs::remove_file(&gaf_base_file);
}
#[test]
fn read_set_db_unidirectional() {
test_read_set_db("micb-kir3dl1_HG003.gbwt", &GAFBaseParams::default());
}
#[test]
fn read_set_db_bidirectional() {
test_read_set_db("bidirectional.gbwt", &GAFBaseParams::default());
}
#[test]
fn read_set_db_no_quality() {
let mut params = GAFBaseParams::default();
params.store_quality_strings = false;
test_read_set_db("micb-kir3dl1_HG003.gbwt", ¶ms);
}
#[test]
fn read_set_db_no_optional() {
let mut params = GAFBaseParams::default();
params.store_optional_fields = false;
test_read_set_db("micb-kir3dl1_HG003.gbwt", ¶ms);
}
#[test]
fn read_set_db_ref_free() {
let mut params = GAFBaseParams::default();
params.reference_free = true;
test_read_set_db("micb-kir3dl1_HG003.gbwt", ¶ms);
}
#[test]
fn read_set_from_rows() {
let graph = internal::load_gaf_base_gbz();
let gaf_base_file = internal::create_gaf_base("micb-kir3dl1_HG003.gaf", "micb-kir3dl1_HG003.gbwt");
let gaf_base = internal::open_gaf_base(&gaf_base_file);
let all_reads = internal::load_gaf_base_reads(false);
let chunk_sizes = vec![1, 2, 5];
for chunk_size in chunk_sizes {
let mut found_alns = 0;
let mut found_blocks = 0;
let mut rowid = 1; while found_alns < gaf_base.alignments() {
let range = rowid..(rowid + chunk_size);
let read_set = ReadSet::from_rows(&gaf_base, range.clone(), Some(&graph));
assert!(read_set.is_ok(), "Failed to extract reads from rows {}..{}: {}", range.start, range.end, read_set.unwrap_err());
let read_set = read_set.unwrap();
assert_eq!(read_set.len(), read_set.unclipped(), "Extracted clipped alignments from rows {}..{}", range.start, range.end);
assert!(found_alns + read_set.len() <= all_reads.len(), "Extracted too many alignments from rows {}..{}", range.start, range.end);
for (i, aln) in read_set.iter().enumerate() {
let truth = &all_reads[found_alns + i];
assert_eq!(aln, truth, "Wrong read {} from rows {}..{}", i, range.start, range.end);
}
found_alns += read_set.len();
found_blocks += read_set.blocks();
rowid += chunk_size;
}
assert_eq!(found_alns, gaf_base.alignments(), "Wrong total number of alignments with chunk size {}", chunk_size);
assert_eq!(found_blocks, gaf_base.blocks(), "Wrong total number of extracted with chunk size {}", chunk_size);
}
drop(gaf_base);
let _ = std::fs::remove_file(&gaf_base_file);
}