use std::{
collections::{HashMap, HashSet},
fs::File,
io::BufReader,
str,
};
use anyhow::Result;
use gen_core::{HashId, PATH_END_NODE_ID, PATH_START_NODE_ID, Strand};
use gen_models::{db::GraphConnection, node::Node, path::Path, sequence::Sequence};
use noodles::fasta;
use thiserror::Error;
use crate::graphs::{BlockGroupChunk, GraphError, NodePoint, stitch};
/// A named sequence part used as one building block of a combinatorial library.
///
/// Parts are keyed by `name` (taken from the FASTA record name) and compared/hashed by
/// all fields, which lets callers deduplicate identical parts with a `HashSet`.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct SequencePart {
    // Part name, taken from the FASTA record name.
    pub name: String,
    // Raw sequence text as read from the FASTA record.
    pub sequence: String,
    // Cached length of `sequence` in characters, as i64 for downstream coordinate math.
    pub sequence_length: i64,
}
/// Errors produced while parsing the parts FASTA file or the library design CSV.
// NOTE(review): the inner String payloads carry the underlying error detail but are not
// interpolated into the Display text ("{0}" is absent from the #[error] messages) —
// consider including them so failures are diagnosable.
#[derive(Error, Debug)]
pub enum CombinatorialLibraryParseError {
    // The parts FASTA could not be opened, read, or decoded as UTF-8.
    #[error("Failed to parse fasta")]
    FastaParseFailed(String),
    // The design CSV could not be opened/read, or referenced a missing part or column.
    #[error("Failed to parse library CSV")]
    CSVParseFailed(String),
}
/// Errors produced while materializing a combinatorial library into the graph.
#[derive(Error, Debug)]
pub enum CombinatorialLibraryCreationError {
    // Internal lookup failure during node creation (e.g. a part hash went missing).
    #[error("Failed to create library")]
    CreationFailed(String),
    // The caller supplied an empty parts list; nothing to build.
    // NOTE(review): the String payload is not shown in the Display text.
    #[error("Failed to create library: No parts provided")]
    NoParts(String),
    // Propagated from `stitch` while joining adjacent columns of the library graph.
    #[error("Graph error: {0}")]
    GraphError(#[from] GraphError),
}
/// Parses a combinatorial library design from a parts FASTA file and a design CSV file.
///
/// Each FASTA record in `parts_filename` defines a named part. Each row of the
/// header-less CSV in `design_filename` lists part names by column; column `i` collects
/// the candidate parts for position `i` of the library. Returns one `Vec<SequencePart>`
/// per column, in column order. Empty CSV cells are ignored.
///
/// # Errors
/// Returns `FastaParseFailed` if the FASTA file cannot be opened or read, or if a
/// record's name or sequence is not valid UTF-8. Returns `CSVParseFailed` if the CSV
/// cannot be opened or read, references an unknown part name, or leaves some column
/// (below the widest row) with no parts at all.
pub fn parse_library(
    parts_filename: &str,
    design_filename: &str,
) -> Result<Vec<Vec<SequencePart>>, CombinatorialLibraryParseError> {
    let mut sequence_parts_by_name = HashMap::new();
    let mut parts_reader = fasta::io::reader::Builder
        .build_from_path(parts_filename)
        .map_err(|e| CombinatorialLibraryParseError::FastaParseFailed(e.to_string()))?;
    for result in parts_reader.records() {
        let record =
            result.map_err(|e| CombinatorialLibraryParseError::FastaParseFailed(e.to_string()))?;
        let sequence = str::from_utf8(record.sequence().as_ref())
            .map_err(|e| CombinatorialLibraryParseError::FastaParseFailed(e.to_string()))?;
        // Record names are raw bytes; surface invalid UTF-8 as a parse error instead of
        // panicking on user-supplied input.
        let name = String::from_utf8(record.name().to_vec())
            .map_err(|e| CombinatorialLibraryParseError::FastaParseFailed(e.to_string()))?;
        sequence_parts_by_name.insert(
            name.clone(),
            SequencePart {
                name,
                sequence: sequence.to_string(),
                sequence_length: sequence.len() as i64,
            },
        );
    }
    let library_file = File::open(design_filename).map_err(|e| {
        CombinatorialLibraryParseError::CSVParseFailed(format!(
            "Failed to open library file {design_filename}: {}",
            e
        ))
    })?;
    let library_reader = BufReader::new(library_file);
    // Maps a design column index to the candidate parts for that library position.
    let mut parts_by_index: HashMap<usize, Vec<SequencePart>> = HashMap::new();
    let mut library_csv_reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .from_reader(library_reader);
    // One past the highest column index that held a non-empty cell in any row.
    let mut max_index = 0;
    for result in library_csv_reader.records() {
        let record =
            result.map_err(|e| CombinatorialLibraryParseError::CSVParseFailed(e.to_string()))?;
        for (index, part_name) in record.iter().enumerate() {
            if !part_name.is_empty() {
                let part = sequence_parts_by_name.get(part_name).ok_or_else(|| {
                    CombinatorialLibraryParseError::CSVParseFailed(format!(
                        "Part {part_name} missing."
                    ))
                })?;
                parts_by_index.entry(index).or_default().push(part.clone());
                max_index = max_index.max(index + 1);
            }
        }
    }
    // Flatten into column order; a column below max_index that never received a part
    // means the design is malformed.
    let mut parts_list = vec![];
    for index in 0..max_index {
        let parts = parts_by_index.get(&index).ok_or_else(|| {
            CombinatorialLibraryParseError::CSVParseFailed(format!("Missing index {index}."))
        })?;
        parts_list.push(parts.clone());
    }
    Ok(parts_list)
}
/// Materializes a combinatorial library into the graph as a `BlockGroupChunk`.
///
/// `parts_list[i]` holds the candidate parts for library position `i`. Each column
/// becomes a set of parallel nodes (one per unique part), and consecutive columns are
/// joined with `stitch`. When `create_block_group` is true, the graph is bracketed with
/// the global `PATH_START_NODE_ID`/`PATH_END_NODE_ID` sentinel nodes and a default
/// `Path` named "<library_name> default path" is created from the stitched path edges.
///
/// # Errors
/// Returns `NoParts` if `parts_list` is empty, `CreationFailed` if an internal
/// name-to-hash or node-length lookup fails, and propagates `GraphError` from `stitch`.
pub fn create_library(
    conn: &GraphConnection,
    block_group_id: HashId,
    library_name: &str,
    parts_list: Vec<Vec<SequencePart>>,
    create_block_group: bool,
) -> Result<BlockGroupChunk, CombinatorialLibraryCreationError> {
    if parts_list.is_empty() {
        return Err(CombinatorialLibraryCreationError::NoParts(
            "No parts provided for library creation.".to_string(),
        ));
    }
    // Collect every distinct part across all columns so each sequence is saved once.
    let mut parts_set = HashSet::new();
    for parts in &parts_list {
        parts_set.extend(parts);
    }
    // Deduplicate within each column, preserving first-occurrence order via retain.
    let mut cleaned_parts_list = vec![];
    for parts in &parts_list {
        let mut unique_parts = HashSet::new();
        let mut result_parts = parts.clone();
        result_parts.retain(|part| unique_parts.insert(part.clone()));
        cleaned_parts_list.push(result_parts);
    }
    // Persist each unique sequence and remember its hash by part name.
    let mut sequence_hashes_by_name = HashMap::new();
    for part in parts_set {
        let seq = Sequence::new()
            .sequence_type("DNA")
            .sequence(&part.sequence)
            .save(conn);
        sequence_hashes_by_name.insert(part.name.clone(), seq.hash);
    }
    // Build one node per part per column; when creating a full block group, bracket the
    // columns with zero-length start/end sentinel nodes.
    let mut part_nodes_list = vec![];
    let mut sequence_lengths_by_node_id = HashMap::new();
    if create_block_group {
        part_nodes_list.push(vec![PATH_START_NODE_ID]);
        sequence_lengths_by_node_id.insert(PATH_START_NODE_ID, 0);
    }
    for (index, parts) in cleaned_parts_list.iter().enumerate() {
        let mut part_nodes = vec![];
        for part in parts {
            let part_hash = sequence_hashes_by_name.get(&part.name).ok_or_else(|| {
                CombinatorialLibraryCreationError::CreationFailed(format!(
                    "Part {} missing.",
                    part.name
                ))
            })?;
            // The node id string encodes library, part, coordinates, sequence hash, and
            // the column index, so the same part reused in different columns yields
            // distinct nodes.
            let part_node_id = Node::create(
                conn,
                part_hash,
                &HashId::convert_str(&format!(
                    "{library_name}:{part_name}:{ref_start}-{ref_end}->{sequence_hash}-column-{index}",
                    part_name = part.name,
                    ref_start = 0,
                    ref_end = part.sequence_length,
                    sequence_hash = part_hash
                )),
            );
            part_nodes.push(part_node_id);
            sequence_lengths_by_node_id.insert(part_node_id, part.sequence_length);
        }
        part_nodes_list.push(part_nodes);
    }
    if create_block_group {
        part_nodes_list.push(vec![PATH_END_NODE_ID]);
        sequence_lengths_by_node_id.insert(PATH_END_NODE_ID, 0);
    }
    // Seed the running chunk from the first column: entry points at coordinate 0 and
    // exit points at each part's full length.
    let mut source_node_points = vec![];
    let mut target_node_points = vec![];
    for part in &part_nodes_list[0] {
        target_node_points.push(NodePoint {
            id: *part,
            coordinate: 0,
            strand: Strand::Forward,
        });
        let part_length = sequence_lengths_by_node_id.get(part).ok_or_else(|| {
            CombinatorialLibraryCreationError::CreationFailed(format!("Part {part} missing."))
        })?;
        source_node_points.push(NodePoint {
            id: *part,
            coordinate: *part_length,
            strand: Strand::Forward,
        });
    }
    // The default path anchors (when requested) follow the first node of this column.
    let (path_start_point, path_end_point) = if create_block_group {
        (
            Some(target_node_points[0].clone()),
            Some(source_node_points[0].clone()),
        )
    } else {
        (None, None)
    };
    let mut result_block_group_chunk = BlockGroupChunk {
        entry_node_points: target_node_points.clone(),
        exit_node_points: source_node_points.clone(),
        path_edges: vec![],
        path_start_point,
        path_end_point,
    };
    // Fold each subsequent column into the accumulated chunk via `stitch`, which joins
    // the previous chunk's exits to the next column's entries.
    for parts in &part_nodes_list[1..] {
        let target_node_points: Vec<NodePoint> = parts
            .iter()
            .map(|part| NodePoint {
                id: *part,
                coordinate: 0,
                strand: Strand::Forward,
            })
            .collect();
        source_node_points = vec![];
        for part in parts {
            let part_length = sequence_lengths_by_node_id.get(part).ok_or_else(|| {
                CombinatorialLibraryCreationError::CreationFailed(format!("Part {part} missing."))
            })?;
            source_node_points.push(NodePoint {
                id: *part,
                coordinate: *part_length,
                strand: Strand::Forward,
            });
        }
        let (path_start_point, path_end_point) = if create_block_group {
            (
                Some(target_node_points[0].clone()),
                Some(source_node_points[0].clone()),
            )
        } else {
            (None, None)
        };
        let target_chunk = BlockGroupChunk {
            entry_node_points: target_node_points.clone(),
            exit_node_points: source_node_points.clone(),
            path_edges: vec![],
            path_start_point,
            path_end_point,
        };
        result_block_group_chunk = stitch(
            conn,
            &result_block_group_chunk,
            &target_chunk,
            block_group_id,
        )?;
    }
    if create_block_group {
        // Record the stitched path edges as the library's default path.
        // NOTE(review): cloning the whole chunk just to take edge ids is avoidable if
        // edge ids can be cloned from references — worth confirming against Edge's API.
        let path_edge_ids = result_block_group_chunk
            .clone()
            .path_edges
            .into_iter()
            .map(|edge| edge.id)
            .collect::<Vec<HashId>>();
        Path::create(
            conn,
            format!("{library_name} default path").as_str(),
            &block_group_id,
            &path_edge_ids,
        );
    }
    Ok(result_block_group_chunk)
}
#[cfg(test)]
mod tests {
    use gen_models::{
        block_group::{BlockGroup, NewBlockGroup},
        sample::Sample,
    };
    use super::*;
    use crate::{test_helpers::setup_gen, track_database};

    /// An empty parts list must be rejected with `NoParts` before any graph work happens.
    #[test]
    fn no_parts_in_list() {
        let context = setup_gen();
        let conn = context.graph().conn();
        let op_conn = context.operations().conn();
        track_database(conn, op_conn).unwrap();
        let collection = "test";
        let sample_name = "test-sample";
        let _sample = Sample::get_or_create(conn, sample_name);
        let block_group = BlockGroup::create(
            conn,
            NewBlockGroup {
                collection_name: collection,
                sample_name,
                name: "test-block-group",
                ..Default::default()
            },
        );
        // Panic messages previously named CreationFailed while the match arm (correctly)
        // expects NoParts; they now name the variant actually asserted, and the wrong-error
        // arm reports what was received instead of discarding it.
        match create_library(conn, block_group.id, "library", vec![], false) {
            Ok(_) => {
                panic!("expected Err(CombinatorialLibraryCreationError::NoParts), got Ok")
            }
            Err(CombinatorialLibraryCreationError::NoParts(_)) => (),
            Err(e) => panic!(
                "expected Err(CombinatorialLibraryCreationError::NoParts), got {e:?}"
            ),
        }
    }
}