use crate::{GBZRecord, GBZPath};
use crate::{GraphInterface, GraphReference};
use crate::{PathIndex, Chains};
use crate::{SubgraphQuery, HaplotypeOutput};
use crate::subgraph::query::QueryType;
use crate::formats::{self, WalkMetadata, JSONValue};
use std::cmp::Reverse;
use std::collections::{BinaryHeap, BTreeMap, BTreeSet, HashSet};
use std::fmt::Display;
use std::io::{self, Write};
use std::iter::FusedIterator;
use std::ops::Range;
use std::cmp;
use gbz::ENDMARKER;
use gbz::{GBZ, GraphPosition, Orientation, Pos, FullPathName};
use gbz::{algorithms, support};
use pggname::{Graph, GraphName};
use pggname::graph::NodeInt;
#[cfg(test)]
mod tests;
pub mod query;
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Subgraph {
records: BTreeMap<usize, GBZRecord>,
paths: Vec<PathInfo>,
ref_id: Option<usize>,
ref_path: Option<FullPathName>,
ref_interval: Option<Range<usize>>,
graph_name: Option<GraphName>,
}
impl Subgraph {
pub fn new() -> Self {
Subgraph::default()
}
pub fn path_pos_from_gbz(
&mut self,
graph: &GBZ,
path_index: &PathIndex,
query_pos: &FullPathName
) -> Result<(PathPosition, FullPathName), String> {
let path = GBZPath::with_name(graph, query_pos).ok_or(
format!("Cannot find a path covering {}", query_pos)
)?;
let query_offset = query_pos.fragment - path.name.fragment;
let index_offset = path_index.path_to_offset(path.handle).ok_or(
format!("Path {} has not been indexed for random access", path.name())
)?;
let (path_offset, pos) = path_index.indexed_position(index_offset, query_offset).unwrap();
self.find_path_pos(GraphReference::Gbz(graph), query_offset, path_offset, pos, path.name)
}
pub fn path_pos_from_db<'reference, 'graph>(
&mut self,
graph: &'reference mut GraphInterface<'graph>,
query_pos: &FullPathName
) -> Result<(PathPosition, FullPathName), String> {
let path = graph.find_path(query_pos)?;
let path = path.ok_or(format!("Cannot find a path covering {}", query_pos))?;
if !path.is_indexed {
return Err(format!("Path {} has not been indexed for random access", query_pos));
}
let query_offset = query_pos.fragment - path.name.fragment;
let result = graph.indexed_position(path.handle, query_offset)?;
let (path_offset, pos) = result.ok_or(
format!("Path {} has not been indexed for random access", path.name())
)?;
self.find_path_pos(GraphReference::Db(graph), query_offset, path_offset, pos, path.name)
}
fn find_path_pos(
&mut self,
graph: GraphReference<'_, '_>,
query_offset: usize,
path_offset: usize,
pos: Pos,
path_name: FullPathName
) -> Result<(PathPosition, FullPathName), String> {
let mut path_offset = path_offset;
let mut pos = pos;
let mut node_offset: Option<usize> = None;
let mut gbwt_pos: Option<Pos> = None;
let mut graph = graph;
loop {
let node_id = support::node_id(pos.node);
if !self.has_node(node_id) {
self.add_node_internal(&mut graph, node_id)?;
}
let record = self.record(pos.node).unwrap();
if path_offset + record.sequence_len() > query_offset {
node_offset = Some(query_offset - path_offset);
gbwt_pos = Some(pos);
break;
}
path_offset += record.sequence_len();
let next = record.to_gbwt_record().lf(pos.offset);
if next.is_none() {
break;
}
pos = next.unwrap();
}
let node_offset = node_offset.ok_or(
format!("Path {} does not contain offset {}", path_name, query_offset)
)?;
let gbwt_pos = gbwt_pos.unwrap();
let path_position = PathPosition {
seq_offset: query_offset,
gbwt_node: gbwt_pos.node,
node_offset,
gbwt_offset: gbwt_pos.offset,
};
Ok((path_position, path_name))
}
pub fn around_position(
&mut self,
graph: GraphReference<'_, '_>,
pos: GraphPosition,
context: usize
) -> Result<(usize, usize), String> {
let mut graph = graph;
if !self.has_node(pos.node) {
self.add_node_internal(&mut graph, pos.node)?;
}
let handle = pos.to_gbwt();
let record = self.record(handle).unwrap();
let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
let start = (pos.node, NodeSide::start(pos.orientation));
active.push(Reverse((pos.offset, start)));
let end_distance = record.sequence_len() - pos.offset - 1;
let end = (pos.node, NodeSide::end(pos.orientation));
active.push(Reverse((end_distance, end)));
self.insert_context(graph, active, context)
}
pub fn around_interval(
&mut self,
graph: GraphReference<'_, '_>,
start_pos: PathPosition,
len: usize,
context: usize
) -> Result<(usize, usize), String> {
if len == 0 {
return Err(String::from("Interval length must be greater than 0"));
}
let mut pos = start_pos.gbwt_pos();
let mut offset = start_pos.node_offset();
let mut len = len;
let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
let mut graph = graph;
loop {
let node_id = support::node_id(pos.node);
let orientation = support::node_orientation(pos.node);
if !self.has_node(node_id) {
self.add_node_internal(&mut graph, node_id)?;
}
let record = self.record(pos.node).unwrap();
if offset >= record.sequence_len() {
return Err(format!("Offset {} in node {} of length {}", offset, node_id, record.sequence_len()));
}
let start = (node_id, NodeSide::start(orientation));
active.push(Reverse((offset, start)));
let end = (node_id, NodeSide::end(orientation));
let distance_to_next = record.sequence_len() - offset;
if len <= distance_to_next {
let end_distance = if len == distance_to_next { 0 } else { distance_to_next - len - 1 };
active.push(Reverse((end_distance, end)));
break;
} else {
active.push(Reverse((0, end)));
}
if let Some(next) = record.to_gbwt_record().lf(pos.offset) {
pos = next;
} else {
return Err(format!("No successor for GBWT position ({}, {})", pos.node, pos.offset));
}
offset = 0;
len -= distance_to_next;
}
self.insert_context(graph, active, context)
}
pub fn around_nodes(
&mut self,
graph: GraphReference<'_, '_>,
nodes: &BTreeSet<usize>,
context: usize
) -> Result<(usize, usize), String> {
let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
for &node_id in nodes {
let start = (node_id, NodeSide::Left);
active.push(Reverse((0, start)));
let end = (node_id, NodeSide::Right);
active.push(Reverse((0, end)));
}
self.insert_context(graph, active, context)
}
fn insert_context(
&mut self,
graph: GraphReference<'_, '_>,
active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>>,
context: usize
) -> Result<(usize, usize), String> {
self.clear_paths();
let mut active = active;
let mut visited: BTreeSet<(usize, NodeSide)> = BTreeSet::new();
let mut to_remove: BTreeSet<usize> = self.node_iter().collect();
let mut inserted = 0;
let mut graph = graph;
while !active.is_empty() {
let (distance, node_side) = active.pop().unwrap().0;
if visited.contains(&node_side) {
continue;
}
visited.insert(node_side);
to_remove.remove(&node_side.0);
if !self.has_node(node_side.0) {
self.add_node_internal(&mut graph, node_side.0)?;
inserted += 1;
}
let other_side = (node_side.0, node_side.1.flip());
if !visited.contains(&other_side) {
let handle = support::encode_node(node_side.0, node_side.1.as_start());
let record = self.record(handle).unwrap();
let next_distance = distance + record.sequence_len() - 1;
if next_distance <= context {
active.push(Reverse((next_distance, other_side)));
}
}
let handle = support::encode_node(node_side.0, node_side.1.as_end());
let record = self.record(handle).unwrap();
let next_distance = distance + 1;
if next_distance <= context {
for successor in record.successors() {
let node_id = support::node_id(successor);
let side = NodeSide::start(support::node_orientation(successor));
if !visited.contains(&(node_id, side)) {
active.push(Reverse((next_distance, (node_id, side))));
}
}
}
}
let removed = to_remove.len();
for node_id in to_remove {
self.remove_node_internal(node_id);
}
Ok((inserted, removed))
}
pub fn between_nodes(&mut self, graph: GraphReference<'_, '_>, start: usize, end: usize, limit: Option<usize>) -> Result<usize, String> {
self.clear_paths();
let mut active = vec![start, support::flip_node(end)];
let mut visited = HashSet::new();
visited.insert(support::decode_node(start).0);
visited.insert(support::decode_node(end).0);
let mut graph = graph;
let mut inserted = 0;
while let Some(curr) = active.pop() {
let (node_id, orientation) = support::decode_node(curr);
if !self.has_node(node_id) {
if let Some(limit) = limit && inserted >= limit {
let (start_id, start_o) = support::decode_node(start);
let (end_id, end_o) = support::decode_node(end);
return Err(format!("Found more than {} new nodes between ({}, {}) and ({}, {})", limit, start_id, start_o, end_id, end_o));
}
self.add_node_internal(&mut graph, node_id)?;
inserted += 1;
}
for (next_id, next_o) in self.supergraph_successors(node_id, orientation).unwrap() {
if visited.contains(&next_id) {
continue;
}
let fw_handle = support::encode_node(next_id, next_o);
let rev_handle = support::flip_node(fw_handle);
active.push(fw_handle); active.push(rev_handle);
visited.insert(next_id);
}
}
Ok(inserted)
}
pub fn extract_snarls(&mut self, graph: GraphReference<'_, '_>, chains: Option<&Chains>) -> Result<usize, String> {
let snarls = self.covered_snarls(chains);
let mut inserted = 0;
let mut graph = graph;
for (start, end) in snarls {
match &mut graph {
GraphReference::Gbz(graph) => {
inserted += self.between_nodes(GraphReference::Gbz(graph), start, end, None)?;
}
GraphReference::Db(graph) => {
inserted += self.between_nodes(GraphReference::Db(graph), start, end, None)?;
},
GraphReference::None => {
return Err(String::from("No graph reference provided"));
}
}
}
Ok(inserted)
}
pub fn from_gbz(&mut self, graph: &GBZ, path_index: Option<&PathIndex>, chains: Option<&Chains>, query: &SubgraphQuery) -> Result<(), String> {
if query.snarls() && chains.is_none() {
return Err(String::from("Top-level chains are required for extracting snarls"));
}
match query.query_type() {
QueryType::PathOffset(query_pos) => {
let path_index = path_index.ok_or(
String::from("Path index is required for path-based queries")
)?;
let reference_path = self.path_pos_from_gbz(graph, path_index, query_pos)?;
self.around_position(GraphReference::Gbz(graph), reference_path.0.graph_pos(), query.context())?;
if query.snarls() {
self.extract_snarls(GraphReference::Gbz(graph), chains)?;
}
self.extract_paths(Some(reference_path), query.output())?;
},
QueryType::PathInterval(query_pos, len) => {
let path_index = path_index.ok_or(
String::from("Path index is required for path-based queries")
)?;
let reference_path = self.path_pos_from_gbz(graph, path_index, query_pos)?;
self.around_interval(GraphReference::Gbz(graph), reference_path.0, *len, query.context())?;
if query.snarls() {
self.extract_snarls(GraphReference::Gbz(graph), chains)?;
}
self.extract_paths(Some(reference_path), query.output())?;
},
QueryType::Nodes(nodes) => {
if query.output() == HaplotypeOutput::ReferenceOnly {
return Err(String::from("Cannot output a reference path in a node-based query"));
}
self.around_nodes(GraphReference::Gbz(graph), nodes, query.context())?;
if query.snarls() {
self.extract_snarls(GraphReference::Gbz(graph), chains)?;
}
self.extract_paths(None, query.output())?;
},
QueryType::Between((start, end), limit) => {
if query.output() == HaplotypeOutput::ReferenceOnly {
return Err(String::from("Cannot output a reference path in a node-based query"));
}
self.between_nodes(GraphReference::Gbz(graph), *start, *end, *limit)?;
self.extract_paths(None, query.output())?;
},
}
let parent = GraphName::from_gbz(graph);
self.compute_name(Some(&parent));
Ok(())
}
pub fn from_db<'reference, 'graph>(&mut self, graph: &'reference mut GraphInterface<'graph>, query: &SubgraphQuery) -> Result<(), String> {
match query.query_type() {
QueryType::PathOffset(query_pos) => {
let reference_path = self.path_pos_from_db(graph, query_pos)?;
self.around_position(GraphReference::Db(graph), reference_path.0.graph_pos(), query.context())?;
if query.snarls() {
self.extract_snarls(GraphReference::Db(graph), None)?;
}
self.extract_paths(Some(reference_path), query.output())?;
},
QueryType::PathInterval(query_pos, len) => {
let reference_path = self.path_pos_from_db(graph, query_pos)?;
self.around_interval(GraphReference::Db(graph), reference_path.0, *len, query.context())?;
if query.snarls() {
self.extract_snarls(GraphReference::Db(graph), None)?;
}
self.extract_paths(Some(reference_path), query.output())?;
},
QueryType::Nodes(nodes) => {
if query.output() == HaplotypeOutput::ReferenceOnly {
return Err(String::from("Cannot output a reference path in a node-based query"));
}
self.around_nodes(GraphReference::Db(graph), nodes, query.context())?;
if query.snarls() {
self.extract_snarls(GraphReference::Db(graph), None)?;
}
self.extract_paths(None, query.output())?;
},
QueryType::Between((start, end), limit) => {
if query.output() == HaplotypeOutput::ReferenceOnly {
return Err(String::from("Cannot output a reference path in a node-based query"));
}
self.between_nodes(GraphReference::Db(graph), *start, *end, *limit)?;
self.extract_paths(None, query.output())?;
},
}
let parent = graph.graph_name()?;
self.compute_name(Some(&parent));
Ok(())
}
fn next_pos(pos: Pos, successors: &BTreeMap<usize, Vec<(Pos, bool)>>) -> Option<Pos> {
if let Some(v) = successors.get(&pos.node) {
let (next, _) = v[pos.offset];
if next.node == ENDMARKER || !successors.contains_key(&next.node) {
None
} else {
Some(next)
}
} else {
None
}
}
pub fn extract_paths(
&mut self,
reference_path: Option<(PathPosition, FullPathName)>,
output: HaplotypeOutput
) -> Result<(), String> {
self.clear_paths();
let ref_pos;
if let Some((position, name)) = reference_path {
ref_pos = Some(position);
self.ref_path = Some(name);
} else {
ref_pos = None;
}
let mut keys: Vec<usize> = Vec::new();
let mut successors: BTreeMap<usize, Vec<(Pos, bool)>> = BTreeMap::new();
for (handle, gbz_record) in self.records.iter() {
let gbwt_record = gbz_record.to_gbwt_record();
let decompressed: Vec<(Pos, bool)> = gbwt_record.decompress().into_iter().map(|x| (x, false)).collect();
keys.push(*handle);
successors.insert(*handle, decompressed);
}
for handle in keys.iter() {
let decompressed = successors.get(handle).unwrap().clone();
for (pos, _) in decompressed.iter() {
if let Some(v) = successors.get_mut(&pos.node) {
v[pos.offset].1 = true;
}
}
}
let mut ref_offset: Option<usize> = None;
for (handle, positions) in successors.iter() {
for (offset, (_, has_predecessor)) in positions.iter().enumerate() {
if *has_predecessor {
continue;
}
let mut curr = Some(Pos::new(*handle, offset));
let mut is_ref = false;
let mut path: Vec<usize> = Vec::new();
let mut len = 0;
while let Some(pos) = curr {
if let Some(position) = ref_pos.as_ref() && pos == position.gbwt_pos() {
self.ref_id = Some(self.paths.len());
ref_offset = Some(path.len());
is_ref = true;
}
path.push(pos.node);
len += self.records.get(&pos.node).unwrap().sequence_len();
curr = Self::next_pos(pos, &successors);
}
if is_ref {
if !support::encoded_path_is_canonical(&path) {
eprintln!("Warning: the reference path is not in canonical orientation");
}
self.paths.push(PathInfo::new(path, len));
} else if support::encoded_path_is_canonical(&path) {
self.paths.push(PathInfo::new(path, len));
}
}
}
if let Some(ref_pos) = ref_pos {
if let Some(offset) = ref_offset {
let ref_info = &self.paths[self.ref_id.unwrap()];
self.ref_interval = Some(ref_info.path_interval(self, offset, &ref_pos));
} else {
self.clear_paths();
return Err(String::from("Could not find the reference path"));
}
}
if output == HaplotypeOutput::Distinct {
self.distinct_paths();
} else if output == HaplotypeOutput::ReferenceOnly {
self.reference_only()?;
}
Ok(())
}
fn distinct_paths(&mut self) {
let ref_path = self.ref_id.map(|id| self.paths[id].path.clone());
self.paths.sort_unstable();
let mut new_paths: Vec<PathInfo> = Vec::new();
let mut ref_id = None;
for info in self.paths.iter() {
if new_paths.is_empty() || new_paths.last().unwrap().path != info.path {
if let Some(ref_path) = &ref_path && info.path == *ref_path {
ref_id = Some(new_paths.len());
}
new_paths.push(PathInfo::weighted(info.path.clone(), info.len));
} else {
new_paths.last_mut().unwrap().increment_weight();
}
}
self.paths = new_paths;
self.ref_id = ref_id;
}
fn reference_only(&mut self) -> Result<(), String> {
if self.ref_id.is_none() {
return Err(String::from("Reference path is required for reference-only output"));
}
let ref_id = self.ref_id.unwrap();
let ref_info = self.paths[ref_id].clone();
self.paths = vec![ref_info];
self.ref_id = Some(0);
Ok(())
}
fn add_node_internal(&mut self, graph: &mut GraphReference<'_, '_>, node_id: usize) -> Result<(), String> {
let forward = graph.gbz_record(support::encode_node(node_id, Orientation::Forward))?;
let reverse = graph.gbz_record(support::encode_node(node_id, Orientation::Reverse))?;
self.records.insert(forward.handle(), forward);
self.records.insert(reverse.handle(), reverse);
Ok(())
}
pub fn add_node_from_gbz(&mut self, graph: &GBZ, node_id: usize) -> Result<(), String> {
if self.has_node(node_id) {
return Ok(());
}
self.clear_paths();
self.add_node_internal(&mut GraphReference::Gbz(graph), node_id)
}
pub fn add_node_from_db(&mut self, graph: &mut GraphInterface, node_id: usize) -> Result<(), String> {
if self.has_node(node_id) {
return Ok(());
}
self.clear_paths();
self.add_node_internal(&mut GraphReference::Db(graph), node_id)
}
fn remove_node_internal(&mut self, node_id: usize) {
self.records.remove(&support::encode_node(node_id, Orientation::Forward));
self.records.remove(&support::encode_node(node_id, Orientation::Reverse));
}
pub fn remove_node(&mut self, node_id: usize) {
if !self.has_node(node_id) {
return;
}
self.clear_paths();
self.remove_node_internal(node_id);
}
fn clear_paths(&mut self) {
self.paths.clear();
self.ref_id = None;
self.ref_path = None;
self.ref_interval = None;
}
}
impl Subgraph {
#[inline]
pub fn nodes(&self) -> usize {
self.records.len() / 2
}
#[inline]
pub fn min_node(&self) -> Option<usize> {
self.records.keys().next().map(|&handle| support::node_id(handle))
}
#[inline]
pub fn max_node(&self) -> Option<usize> {
self.records.keys().next_back().map(|&handle| support::node_id(handle))
}
#[inline]
pub fn min_handle(&self) -> Option<usize> {
self.records.keys().next().copied()
}
#[inline]
pub fn max_handle(&self) -> Option<usize> {
self.records.keys().next_back().copied()
}
#[inline]
pub fn has_node(&self, node_id: usize) -> bool {
self.records.contains_key(&support::encode_node(node_id, Orientation::Forward))
}
#[inline]
pub fn has_handle(&self, handle: usize) -> bool {
self.records.contains_key(&handle)
}
#[inline]
pub fn sequence_for_handle(&self, handle: usize) -> Option<&[u8]> {
self.records.get(&handle).map(|record| record.sequence())
}
#[inline]
pub fn sequence(&self, node_id: usize) -> Option<&[u8]> {
self.sequence_for_handle(support::encode_node(node_id, Orientation::Forward))
}
#[inline]
pub fn oriented_sequence(&self, node_id: usize, orientation: Orientation) -> Option<&[u8]> {
self.sequence_for_handle(support::encode_node(node_id, orientation))
}
#[inline]
pub fn sequence_len(&self, node_id: usize) -> Option<usize> {
self.records.get(&support::encode_node(node_id, Orientation::Forward)).map(|record| record.sequence_len())
}
pub fn node_iter(&'_ self) -> impl Iterator<Item = usize> + use<'_> {
self.records.keys().step_by(2).map(|&handle| support::node_id(handle))
}
pub fn handle_iter(&'_ self) -> impl Iterator<Item = usize> + use<'_> {
self.records.keys().copied()
}
pub fn successors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
let handle = support::encode_node(node_id, orientation);
let record = self.records.get(&handle)?;
Some(EdgeIter::new(self, record, false))
}
pub fn supergraph_successors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
let handle = support::encode_node(node_id, orientation);
let record = self.records.get(&handle)?;
Some(EdgeIter::in_supergraph(self, record, false))
}
pub fn predecessors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
let handle = support::encode_node(node_id, orientation.flip());
let record = self.records.get(&handle)?;
Some(EdgeIter::new(self, record, true))
}
pub fn supergraph_predecessors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
let handle = support::encode_node(node_id, orientation.flip());
let record = self.records.get(&handle)?;
Some(EdgeIter::in_supergraph(self, record, true))
}
#[inline]
fn record(&self, handle: usize) -> Option<&GBZRecord> {
self.records.get(&handle)
}
#[inline]
pub fn paths(&self) -> usize {
self.paths.len()
}
pub fn covered_snarls(&self, chains: Option<&Chains>) -> BTreeSet<(usize, usize)> {
let mut snarls: BTreeSet<(usize, usize)> = BTreeSet::new();
for (&handle, record) in self.records.iter() {
let next = if let Some(chains) = chains {
chains.next(handle)
} else {
record.next()
};
if next.is_none() {
continue;
}
let next = next.unwrap();
if !self.has_handle(next) {
continue;
}
if support::encoded_edge_is_canonical(handle, next) {
snarls.insert((handle, next));
}
}
snarls
}
}
impl Subgraph {
fn path_len(&self, path: &[usize]) -> usize {
let mut result = 0;
for handle in path.iter() {
let record = self.records.get(handle).unwrap();
result += record.sequence_len();
}
result
}
fn append_edit(edits: &mut Vec<(EditOperation, usize)>, op: EditOperation, len: usize) {
if len == 0 {
return;
}
if let Some((prev_op, prev_len)) = edits.last_mut() && *prev_op == op {
*prev_len += len;
return;
}
edits.push((op, len));
}
fn prefix_matches(&self, path: &[usize], ref_path: &[usize]) -> usize {
let mut result = 0;
let mut path_offset = 0;
let mut ref_offset = 0;
let mut path_base = 0;
let mut ref_base = 0;
while path_offset < path.len() && ref_offset < ref_path.len() {
let a = self.records.get(&path[path_offset]).unwrap().sequence();
let b = self.records.get(&ref_path[ref_offset]).unwrap().sequence();
while path_base < a.len() && ref_base < b.len() {
if a[path_base] != b[ref_base] {
return result;
}
path_base += 1;
ref_base += 1;
result += 1;
}
if path_base == a.len() {
path_offset += 1;
path_base = 0;
}
if ref_base == b.len() {
ref_offset += 1;
ref_base = 0;
}
}
result
}
fn suffix_matches(&self, path: &[usize], ref_path: &[usize]) -> usize {
let mut result = 0;
let mut path_offset = 0;
let mut ref_offset = 0;
let mut path_base = 0;
let mut ref_base = 0;
while path_offset < path.len() && ref_offset < ref_path.len() {
let a = self.records.get(&path[path.len() - path_offset - 1]).unwrap().sequence();
let b = self.records.get(&ref_path[ref_path.len() - ref_offset - 1]).unwrap().sequence();
while path_base < a.len() && ref_base < b.len() {
if a[a.len() - path_base - 1] != b[b.len() - ref_base - 1] {
return result;
}
path_base += 1;
ref_base += 1;
result += 1;
}
if path_base == a.len() {
path_offset += 1;
path_base = 0;
}
if ref_base == b.len() {
ref_offset += 1;
ref_base = 0;
}
}
result
}
fn mismatch_penalty(len: usize) -> usize {
4 * len
}
fn gap_penalty(len: usize) -> usize {
if len == 0 {
0
} else {
6 + (len - 1)
}
}
fn align(&self, path: &[usize], ref_path: &[usize], edits: &mut Vec<(EditOperation, usize)>) {
let path_len = self.path_len(path);
let ref_len = self.path_len(ref_path);
let prefix = self.prefix_matches(path, ref_path);
let mut suffix = self.suffix_matches(path, ref_path);
if prefix + suffix > path_len {
suffix = path_len - prefix;
}
if prefix + suffix > ref_len {
suffix = ref_len - prefix;
}
Self::append_edit(edits, EditOperation::Match, prefix);
let path_middle = path_len - prefix - suffix;
let ref_middle = ref_len - prefix - suffix;
if path_middle == 0 {
Self::append_edit(edits, EditOperation::Deletion, ref_middle);
} else if ref_middle == 0 {
Self::append_edit(edits, EditOperation::Insertion, path_middle);
} else {
let mismatch = cmp::min(path_middle, ref_middle);
let mismatch_indel =
Self::mismatch_penalty(mismatch) +
Self::gap_penalty(path_middle - mismatch) +
Self::gap_penalty(ref_middle - mismatch);
let insertion_deletion = Self::gap_penalty(path_middle) + Self::gap_penalty(ref_middle);
if mismatch_indel <= insertion_deletion {
Self::append_edit(edits, EditOperation::Match, mismatch);
Self::append_edit(edits, EditOperation::Insertion, path_middle - mismatch);
Self::append_edit(edits, EditOperation::Deletion, ref_middle - mismatch);
} else {
Self::append_edit(edits, EditOperation::Insertion, path_middle);
Self::append_edit(edits, EditOperation::Deletion, ref_middle);
}
}
Self::append_edit(edits, EditOperation::Match, suffix);
}
fn align_to_ref(&self, path_id: usize) -> Option<String> {
let ref_id = self.ref_id?;
if path_id == ref_id || path_id >= self.paths.len() {
return None;
}
let weight = &|handle: usize| -> usize {
self.records.get(&handle).unwrap().sequence_len()
};
let path = &self.paths[path_id].path;
let ref_path = &self.paths[ref_id].path;
let (lcs, _) = algorithms::fast_weighted_lcs(path, ref_path, weight);
let mut edits: Vec<(EditOperation, usize)> = Vec::new();
let mut path_offset = 0;
let mut ref_offset = 0;
for (next_path_offset, next_ref_offset) in lcs.iter() {
let path_interval = &path[path_offset..*next_path_offset];
let ref_interval = &ref_path[ref_offset..*next_ref_offset];
self.align(path_interval, ref_interval, &mut edits);
let node_len = self.records.get(&path[*next_path_offset]).unwrap().sequence_len();
Self::append_edit(&mut edits, EditOperation::Match, node_len);
path_offset = next_path_offset + 1;
ref_offset = next_ref_offset + 1;
}
self.align(&path[path_offset..], &ref_path[ref_offset..], &mut edits);
let mut result = String::new();
for (op, len) in edits.iter() {
result.push_str(&format!("{}{}", len, op));
}
Some(result)
}
}
impl Subgraph {
pub fn write_gfa<T: Write>(&self, output: &mut T, cigar: bool) -> io::Result<()> {
let reference_samples = self.ref_path.as_ref().map(|path| path.sample.as_ref());
formats::write_gfa_header(reference_samples, output)?;
if let Some(graph_name) = self.graph_name.as_ref() {
let header_lines = graph_name.to_gfa_header_lines();
formats::write_header_lines(&header_lines, output)?;
}
for (handle, record) in self.records.iter() {
if support::node_orientation(*handle) == Orientation::Forward {
formats::write_gfa_node(record.id(), record.sequence(), output)?;
}
}
for (handle, record) in self.records.iter() {
let from = support::decode_node(*handle);
for successor in record.successors() {
let to = support::decode_node(successor);
if self.has_handle(successor) && support::edge_is_canonical(from, to) {
formats::write_gfa_link(
(from.0.to_string().as_bytes(), from.1),
(to.0.to_string().as_bytes(), to.1),
output
)?;
}
}
}
if let Some((metadata, ref_id)) = self.ref_metadata() {
formats::write_gfa_walk(&self.paths[ref_id].path, &metadata, output)?;
}
let mut haplotype = 1;
let contig_name = self.contig_name();
for (id, path_info) in self.paths.iter().enumerate() {
if Some(id) == self.ref_id {
continue;
}
let mut metadata = WalkMetadata::anonymous(haplotype, &contig_name, path_info.len);
metadata.add_weight(path_info.weight);
if cigar {
metadata.add_cigar(self.align_to_ref(id));
}
formats::write_gfa_walk(&path_info.path, &metadata, output)?;
haplotype += 1;
}
Ok(())
}
pub fn write_json<T: Write>(&self, output: &mut T, cigar: bool) -> io::Result<()> {
let mut nodes: Vec<JSONValue> = Vec::new();
for (_, record) in self.records.iter() {
let (id, orientation) = support::decode_node(record.handle());
if orientation == Orientation::Reverse {
continue;
}
let node = JSONValue::Object(vec![
("id".to_string(), JSONValue::String(id.to_string())),
("sequence".to_string(), JSONValue::String(String::from_utf8_lossy(record.sequence()).to_string())),
]);
nodes.push(node);
}
let mut edges: Vec<JSONValue> = Vec::new();
for (handle, record) in self.records.iter() {
let from = support::decode_node(*handle);
for successor in record.successors() {
let to = support::decode_node(successor);
if self.has_handle(successor) && support::edge_is_canonical(from, to) {
let edge = JSONValue::Object(vec![
("from".to_string(), JSONValue::String(from.0.to_string())),
("from_is_reverse".to_string(), JSONValue::Boolean(from.1 == Orientation::Reverse)),
("to".to_string(), JSONValue::String(to.0.to_string())),
("to_is_reverse".to_string(), JSONValue::Boolean(to.1 == Orientation::Reverse)),
]);
edges.push(edge);
}
}
}
let mut paths: Vec<JSONValue> = Vec::new();
if let Some((metadata, ref_id)) = self.ref_metadata() {
let ref_path = formats::json_path(&self.paths[ref_id].path, &metadata);
paths.push(ref_path);
}
let mut haplotype = 1;
let contig_name = self.contig_name();
for (id, path_info) in self.paths.iter().enumerate() {
if Some(id) == self.ref_id {
continue;
}
let mut metadata = WalkMetadata::anonymous(haplotype, &contig_name, path_info.len);
metadata.add_weight(path_info.weight);
if cigar {
metadata.add_cigar(self.align_to_ref(id));
}
let path = formats::json_path(&path_info.path, &metadata);
paths.push(path);
haplotype += 1;
}
let result = JSONValue::Object(vec![
("nodes".to_string(), JSONValue::Array(nodes)),
("edges".to_string(), JSONValue::Array(edges)),
("paths".to_string(), JSONValue::Array(paths)),
]);
output.write_all(result.to_string().as_bytes())?;
Ok(())
}
fn ref_metadata(&self) -> Option<(WalkMetadata, usize)> {
let ref_id = self.ref_id?;
let ref_path = self.ref_path.as_ref()?;
let interval = self.ref_interval.as_ref()?.clone();
let mut metadata = WalkMetadata::path_interval(ref_path, interval);
metadata.add_weight(self.paths[ref_id].weight);
Some((metadata, ref_id))
}
fn contig_name(&self) -> String {
if let Some(ref_path) = self.ref_path.as_ref() {
ref_path.contig.clone()
} else {
String::from("unknown")
}
}
}
impl Subgraph {
pub fn graph_name(&self) -> Option<&GraphName> {
self.graph_name.as_ref()
}
pub fn has_graph_name(&self) -> bool {
self.graph_name.is_some()
}
pub fn compute_name(&mut self, parent: Option<&GraphName>) {
if self.has_graph_name() {
return;
}
let name = pggname::stable_name(self);
let mut result = GraphName::new(name);
if let Some(parent) = parent {
result.make_subgraph_of(parent);
}
self.graph_name = Some(result);
}
}
impl Graph for Subgraph {
fn new() -> Self {
unimplemented!();
}
fn add_node(&mut self, _: &[u8], _: &[u8]) -> Result<(), String> {
unimplemented!();
}
fn add_edge(&mut self, _: &[u8], _: Orientation, _: &[u8], _: Orientation) -> Result<(), String> {
unimplemented!();
}
fn finalize(&mut self) -> Result<(), String> {
Ok(())
}
fn statistics(&self) -> (usize, usize, usize) {
let node_count = self.nodes();
let mut edge_count = 0;
let mut seq_len = 0;
for (handle, record) in self.records.iter() {
let (from_id, from_o) = support::decode_node(*handle);
if from_o == Orientation::Forward {
seq_len += record.sequence_len();
}
for successor in record.successors() {
let (to_id, to_o) = support::decode_node(successor);
if self.has_node(to_id) && support::edge_is_canonical((from_id, from_o), (to_id, to_o)) {
edge_count += 1;
}
}
}
(node_count, edge_count, seq_len)
}
fn node_iter(&self) -> impl Iterator<Item=Vec<u8>> {
self.node_iter().map(|from_id| {
let sequence = self.sequence(from_id).unwrap().to_vec();
let mut node = NodeInt::new(Some(sequence));
for from_o in [Orientation::Forward, Orientation::Reverse] {
for (to_id, to_o) in self.successors(from_id, from_o).unwrap() {
if support::edge_is_canonical((from_id, from_o), (to_id, to_o)) {
node.edges.push((from_o, to_id, to_o));
}
}
}
node.finalize();
node.serialize(from_id)
})
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PathPosition {
seq_offset: usize,
gbwt_node: usize,
node_offset: usize,
gbwt_offset: usize,
}
impl PathPosition {
#[inline]
pub fn seq_offset(&self) -> usize {
self.seq_offset
}
#[inline]
pub fn node_id(&self) -> usize {
support::node_id(self.gbwt_node)
}
#[inline]
pub fn orientation(&self) -> Orientation {
support::node_orientation(self.gbwt_node)
}
#[inline]
pub fn node_offset(&self) -> usize {
self.node_offset
}
#[inline]
pub fn graph_pos(&self) -> GraphPosition {
GraphPosition {
node: self.node_id(),
orientation: self.orientation(),
offset: self.node_offset(),
}
}
#[inline]
pub fn handle(&self) -> usize {
self.gbwt_node
}
#[inline]
pub fn gbwt_offset(&self) -> usize {
self.gbwt_offset
}
#[inline]
pub fn gbwt_pos(&self) -> Pos {
Pos {
node: self.handle(),
offset: self.gbwt_offset(),
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum NodeSide {
Left,
Right,
}
impl NodeSide {
fn flip(&self) -> NodeSide {
match self {
NodeSide::Left => NodeSide::Right,
NodeSide::Right => NodeSide::Left,
}
}
fn as_start(&self) -> Orientation {
match self {
NodeSide::Left => Orientation::Forward,
NodeSide::Right => Orientation::Reverse,
}
}
fn as_end(&self) -> Orientation {
match self {
NodeSide::Left => Orientation::Reverse,
NodeSide::Right => Orientation::Forward,
}
}
fn start(orientation: Orientation) -> NodeSide {
if orientation == Orientation::Forward {
NodeSide::Left
} else {
NodeSide::Right
}
}
fn end(orientation: Orientation) -> NodeSide {
if orientation == Orientation::Forward {
NodeSide::Right
} else {
NodeSide::Left
}
}
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct PathInfo {
path: Vec<usize>,
len: usize,
weight: Option<usize>,
}
impl PathInfo {
fn new(path: Vec<usize>, len: usize) -> Self {
PathInfo { path, len, weight: None }
}
fn weighted(path: Vec<usize>, len: usize) -> Self {
PathInfo { path, len, weight: Some(1) }
}
fn increment_weight(&mut self) {
if let Some(weight) = self.weight {
self.weight = Some(weight + 1);
}
}
fn path_interval(&self, subgraph: &Subgraph, path_offset: usize, path_pos: &PathPosition) -> Range<usize> {
let mut ref_pos = path_pos.node_offset();
for handle in self.path.iter().take(path_offset) {
ref_pos += subgraph.records.get(handle).unwrap().sequence_len();
}
let start = path_pos.seq_offset() - ref_pos;
let end = start + self.len;
start..end
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum EditOperation {
Match,
Insertion,
Deletion,
}
impl Display for EditOperation {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
EditOperation::Match => write!(f, "M"),
EditOperation::Insertion => write!(f, "I"),
EditOperation::Deletion => write!(f, "D"),
}
}
}
#[derive(Clone, Debug)]
pub struct EdgeIter<'a> {
parent: &'a Subgraph,
successors: &'a [Pos],
next: usize,
limit: usize,
flip: bool,
supergraph: bool,
}
impl<'a> EdgeIter<'a> {
pub fn new(parent: &'a Subgraph, record: &'a GBZRecord, flip: bool) -> Self {
let successors = record.edges_slice();
let limit = successors.len();
EdgeIter {
parent,
successors,
next: 0,
limit,
flip,
supergraph: false,
}
}
pub fn in_supergraph(parent: &'a Subgraph, record: &'a GBZRecord, flip: bool) -> Self {
let successors = record.edges_slice();
let limit = successors.len();
EdgeIter {
parent,
successors,
next: 0,
limit,
flip,
supergraph: true,
}
}
}
impl<'a> Iterator for EdgeIter<'a> {
type Item = (usize, Orientation);
fn next(&mut self) -> Option<Self::Item> {
while self.next < self.limit {
let handle = self.successors[self.next].node;
self.next += 1;
if self.supergraph || self.parent.has_handle(handle) {
let node_id = support::node_id(handle);
let orientation = if self.flip {
support::node_orientation(handle).flip()
} else {
support::node_orientation(handle)
};
return Some((node_id, orientation));
}
}
None
}
}
impl<'a> DoubleEndedIterator for EdgeIter<'a> {
fn next_back(&mut self) -> Option<Self::Item> {
while self.next < self.limit {
let handle = self.successors[self.limit - 1].node;
self.limit -= 1;
if self.supergraph || self.parent.has_handle(handle) {
let node_id = support::node_id(handle);
let orientation = if self.flip {
support::node_orientation(handle).flip()
} else {
support::node_orientation(handle)
};
return Some((node_id, orientation));
}
}
None
}
}
impl<'a> FusedIterator for EdgeIter<'a> {}