use std::{io::Read, str};
use gb_io::reader;
use gen_core::{HashId, PATH_END_NODE_ID, PATH_START_NODE_ID, PathBlock, Strand};
use gen_models::{
block_group::{BlockGroup, PathChange},
block_group_edge::{BlockGroupEdge, BlockGroupEdgeData},
collection::Collection,
db::DbContext,
edge::Edge,
node::Node,
operations::{Operation, OperationInfo},
path::Path,
sample::Sample,
sequence::Sequence,
session_operations::{end_operation, start_operation},
};
use crate::{
genbank::{EditType, GenBankError, process_sequence},
progress_bar::{add_saving_operation_bar, get_handler, get_progress_bar},
};
/// Imports sequences from a GenBank stream into the database.
///
/// Each successfully parsed GenBank entry becomes a `Sequence`, a wild-type
/// node, and a `BlockGroup` holding a linear path
/// (`PATH_START_NODE_ID -> wt node -> PATH_END_NODE_ID`). Any edits reported
/// by `locus.changes_to_wt()` (Geneious-style editing history) are then
/// layered onto that path as alternate changes on chromosome index 1.
/// The whole import is recorded as a single operation.
///
/// # Arguments
/// * `context` - database context; the graph connection is taken from it.
/// * `data` - any `Read` source containing GenBank-formatted text.
/// * `collection` - optional collection name; `None` maps to the default ("")
///   collection via `unwrap_or_default`.
/// * `sample` - optional sample name, created on demand if given.
/// * `operation_info` - file list and description persisted with the operation.
///
/// # Errors
/// Returns `GenBankError::ParseError` as soon as any entry fails to parse
/// (entries already imported in this session are not rolled back here), or
/// `GenBankError::OperationError` if the operation cannot be finalized.
pub fn import_genbank<'a, R>(
    context: &DbContext,
    data: R,
    collection: impl Into<Option<&'a str>>,
    sample: impl Into<Option<&'a str>>,
    operation_info: OperationInfo,
) -> Result<Operation, GenBankError>
where
    R: Read,
{
    let conn = context.graph().conn();
    let progress_bar = get_handler();
    // All writes below happen inside this session; it is finalized by
    // end_operation at the bottom.
    let mut session = start_operation(conn);
    let reader = reader::SeqReader::new(data);
    let collection = Collection::create(conn, collection.into().unwrap_or_default());
    let sample = sample.into();
    if let Some(sample_name) = sample {
        // Ensure the sample row exists before block groups reference it.
        Sample::get_or_create(conn, sample_name);
    }
    let _ = progress_bar.println("Parsing GenBank");
    let bar = progress_bar.add(get_progress_bar(None));
    bar.set_message("Entries parsed");
    for result in reader {
        match result {
            Ok(seq) => {
                let locus = process_sequence(seq)?;
                let original_seq = locus.original_sequence();
                // Build the wild-type sequence record; name and molecule type
                // are only attached when the GenBank entry provides them.
                let mut seq_model = Sequence::new().sequence(&original_seq);
                if !locus.name.is_empty() {
                    seq_model = seq_model.name(&locus.name);
                }
                if let Some(ref mol_type) = locus.molecule_type {
                    seq_model = seq_model.sequence_type(mol_type);
                }
                let sequence = seq_model.save(conn);
                // Node id is derived from "<collection>.<contig>:<hash>" so
                // re-imports of the same sequence are deterministic.
                let wt_node_id = Node::create(
                    conn,
                    &sequence.hash,
                    &HashId::convert_str(&format!(
                        "{collection}.{contig}:{hash}",
                        collection = &collection.name,
                        contig = &locus.name,
                        hash = sequence.hash
                    )),
                );
                let block_group = BlockGroup::create(conn, &collection.name, sample, &locus.name);
                // Edges anchoring the wild-type node between the virtual
                // start/end nodes, forming the initial linear path.
                let edge_into = Edge::create(
                    conn,
                    PATH_START_NODE_ID,
                    0,
                    Strand::Forward,
                    wt_node_id,
                    0,
                    Strand::Forward,
                );
                let edge_out_of = Edge::create(
                    conn,
                    wt_node_id,
                    sequence.length,
                    Strand::Forward,
                    PATH_END_NODE_ID,
                    0,
                    Strand::Forward,
                );
                // Both edges are attached to the block group on chromosome
                // index 0 (the reference haplotype), unphased.
                BlockGroupEdge::bulk_create(
                    conn,
                    &[
                        BlockGroupEdgeData {
                            block_group_id: block_group.id,
                            edge_id: edge_into.id,
                            chromosome_index: 0,
                            phased: 0,
                        },
                        BlockGroupEdgeData {
                            block_group_id: block_group.id,
                            edge_id: edge_out_of.id,
                            chromosome_index: 0,
                            phased: 0,
                        },
                    ],
                );
                let path = Path::create(
                    conn,
                    &locus.name,
                    &block_group.id,
                    &[edge_into.id, edge_out_of.id],
                );
                // Apply each recorded edit against the wild-type path. Edits
                // go on chromosome index 1 so the reference stays intact.
                for edit in locus.changes_to_wt() {
                    let start = edit.start;
                    let end = edit.end;
                    let change = match edit.edit_type {
                        EditType::Insertion | EditType::Replacement => {
                            // New material: persist the edited sequence, then
                            // create a node keyed by the parent hash plus the
                            // edited coordinate range.
                            let change_seq = Sequence::new()
                                .sequence(&edit.new_sequence)
                                .name(&format!(
                                    "Geneious type: Editing History {edit_type}",
                                    edit_type = edit.edit_type
                                ))
                                .sequence_type("DNA")
                                .save(conn);
                            let change_node = Node::create(
                                conn,
                                &change_seq.hash,
                                &HashId::convert_str(&format!(
                                    "{parent_hash}:{start}-{end}->{new_hash}",
                                    parent_hash = &sequence.hash,
                                    new_hash = &change_seq.hash,
                                )),
                            );
                            PathChange {
                                block_group_id: block_group.id,
                                path: path.clone(),
                                path_accession: None,
                                start,
                                end,
                                block: PathBlock {
                                    id: 0,
                                    node_id: change_node,
                                    block_sequence: edit.new_sequence.clone(),
                                    sequence_start: 0,
                                    sequence_end: change_seq.length,
                                    path_start: start,
                                    // NOTE(review): path_end extends past `end`
                                    // by the inserted length — assumed to be
                                    // the PathChange convention; confirm
                                    // against BlockGroup::insert_change.
                                    path_end: end + change_seq.length,
                                    strand: Strand::Forward,
                                },
                                chromosome_index: 1,
                                phased: 0,
                                preserve_edge: true,
                            }
                        }
                        // Deletion: an empty block spanning start..end on the
                        // wild-type node removes that region on chromosome 1.
                        EditType::Deletion => PathChange {
                            block_group_id: block_group.id,
                            path: path.clone(),
                            path_accession: None,
                            start,
                            end,
                            block: PathBlock {
                                id: 0,
                                node_id: wt_node_id,
                                block_sequence: "".to_string(),
                                sequence_start: 0,
                                sequence_end: 0,
                                path_start: start,
                                path_end: end,
                                strand: Strand::Forward,
                            },
                            chromosome_index: 1,
                            phased: 0,
                            preserve_edge: true,
                        },
                    };
                    // The interval tree is rebuilt per edit; presumably
                    // insert_change can affect what the tree reflects —
                    // TODO(review): confirm whether it could be hoisted.
                    let tree = path.intervaltree(conn);
                    // NOTE(review): unwrap panics on a failed change insert
                    // rather than surfacing a GenBankError.
                    BlockGroup::insert_change(conn, &change, &tree).unwrap();
                }
            }
            // First parse failure aborts the import with a ParseError.
            Err(e) => return Err(GenBankError::ParseError(format!("Failed to parse {e}"))),
        }
        bar.inc(1);
    }
    bar.finish();
    let bar = add_saving_operation_bar(&progress_bar);
    // Finalize the session; the operation summary lists the imported files.
    let op = end_operation(
        context,
        &mut session,
        &operation_info,
        &format!(
            "Genbank Import of {files}",
            files = operation_info
                .files
                .iter()
                .map(|f| f.file_path.clone())
                .collect::<Vec<_>>()
                .join(",")
        ),
        None,
    )
    .map_err(GenBankError::OperationError);
    bar.finish();
    op
}
#[cfg(test)]
mod tests {
use std::{collections::HashSet, fs::File, io::BufReader, path::PathBuf};
use gen_models::{file_types::FileTypes, operations::OperationFile, traits::Query};
use noodles::fasta;
use super::*;
use crate::{test_helpers::setup_gen, track_database};
/// Loads the unmodified (wild-type) reference sequence from the FASTA
/// fixture and returns it as a `String`.
fn get_unmodified_sequence() -> String {
    let fixture =
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/geneious_genbank/unmodified.fa");
    let mut reader = fasta::io::reader::Builder.build_from_path(fixture).unwrap();
    // Only the first record matters for these fixtures.
    let first = reader.records().next().unwrap().unwrap();
    String::from_utf8(first.sequence().as_ref().to_vec()).unwrap()
}
#[test]
fn test_error_on_invalid_file() {
    let context = setup_gen();
    let conn = context.graph().conn();
    let op_conn = context.operations().conn();
    track_database(conn, op_conn).unwrap();
    // Feed a payload that is not GenBank and expect a ParseError.
    let bogus = BufReader::new("this is not valid".as_bytes());
    let info = OperationInfo {
        files: vec![OperationFile {
            file_path: "".to_string(),
            file_type: FileTypes::GenBank,
        }],
        description: "test".to_string(),
    };
    let result = import_genbank(&context, bogus, None, None, info);
    let expected = GenBankError::ParseError(
        "Failed to parse Syntax error: Error MapRes while parsing [this is not valid]".to_string(),
    );
    assert_eq!(result, Err(expected))
}
#[test]
fn test_records_operation() {
    let context = setup_gen();
    let conn = context.graph().conn();
    let op_conn = context.operations().conn();
    track_database(conn, op_conn).unwrap();
    let gb_path =
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/geneious_genbank/insertion.gb");
    let reader = BufReader::new(File::open(&gb_path).unwrap());
    let info = OperationInfo {
        files: vec![OperationFile {
            file_path: gb_path.to_str().unwrap().to_string(),
            file_type: FileTypes::GenBank,
        }],
        description: "test".to_string(),
    };
    // A successful import must be retrievable again by its operation hash.
    let operation = import_genbank(&context, reader, None, None, info).unwrap();
    let fetched = Operation::get_by_id(op_conn, &operation.hash).unwrap();
    assert_eq!(fetched, operation);
}
#[test]
fn test_creates_sample() {
    let context = setup_gen();
    let conn = context.graph().conn();
    let op_conn = context.operations().conn();
    track_database(conn, op_conn).unwrap();
    let gb_path =
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/geneious_genbank/insertion.gb");
    let reader = BufReader::new(File::open(&gb_path).unwrap());
    let info = OperationInfo {
        files: vec![OperationFile {
            file_path: "".to_string(),
            file_type: FileTypes::GenBank,
        }],
        description: "test".to_string(),
    };
    // Importing with a sample name should create that sample on demand.
    let _ = import_genbank(&context, reader, None, "new-sample", info);
    let sample = Sample::get_by_name(conn, "new-sample").unwrap();
    assert_eq!(sample.name, "new-sample");
}
#[cfg(test)]
mod geneious_genbanks {
use super::*;
use crate::{normalize_string, track_database};
#[test]
fn test_parses_insertion() {
    let context = setup_gen();
    let conn = context.graph().conn();
    let op_conn = context.operations().conn();
    track_database(conn, op_conn).unwrap();
    let path =
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/geneious_genbank/insertion.gb");
    let file = File::open(&path).unwrap();
    let _ = import_genbank(
        &context,
        BufReader::new(file),
        None,
        None,
        OperationInfo {
            files: vec![OperationFile {
                file_path: "".to_string(),
                file_type: FileTypes::GenBank,
            }],
            description: "test".to_string(),
        },
    );
    // Re-parse the fixture to get the modified (post-insertion) sequence.
    let f = reader::parse_file(&path).unwrap();
    let seq = str::from_utf8(&f[0].seq).unwrap().to_string();
    let block_group_id = BlockGroup::get_id("", None, "insertion");
    let seqs = BlockGroup::get_all_sequences(conn, &block_group_id, false);
    // Expect both the modified sequence and the wild type (the 1425..2220
    // inserted region removed). Slices can be formatted directly — the
    // intermediate .to_string() calls were redundant allocations.
    assert_eq!(
        seqs,
        HashSet::from_iter([seq.clone(), format!("{}{}", &seq[..1425], &seq[2220..])])
    );
}
#[test]
fn test_parses_deletion() {
    let context = setup_gen();
    let conn = context.graph().conn();
    let op_conn = context.operations().conn();
    track_database(conn, op_conn).unwrap();
    let path =
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/geneious_genbank/deletion.gb");
    let file = File::open(&path).unwrap();
    let _ = import_genbank(
        &context,
        BufReader::new(file),
        None,
        None,
        OperationInfo {
            files: vec![OperationFile {
                file_path: "".to_string(),
                file_type: FileTypes::GenBank,
            }],
            description: "test".to_string(),
        },
    );
    // The fixture sequence is the post-deletion sequence; `deleted` is the
    // removed region, re-inserted at offset 765 to reconstruct the wild type.
    let f = reader::parse_file(&path).unwrap();
    let seq = str::from_utf8(&f[0].seq).unwrap().to_string();
    let deleted: String = normalize_string(
        "TTACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATT
CATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCCAGCG
GCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGGCGAAGAAGT
TGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGGATTGGCTGAGACGA
AAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTTTCACCGTAACACGCCACAT
CTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGTGGTATTCACTCCAGAGCGATGAAA
ACGTTTCAGTTTGCTCATGGAAAACGGTGTAACAAGGGTGAACACTATCCCATATCACCAGCT
CACCGTCTTTCATTGCCATACGGAATTCCGGATGAGCATTCATCAGGCGGGCAAGAATGTGAA
TAAAGGCCGGATAAAACTTGTGCTTATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCA
GCTGAACGGTCTGGTTATAGGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTAC
GATGCCATTGGGATATATCAACGGTGGTATATCCAGTGATTTTTTTCTCCAT",
    );
    let block_group_id = BlockGroup::get_id("", None, "deletion");
    let seqs = BlockGroup::get_all_sequences(conn, &block_group_id, false);
    // Slices can be formatted directly — the intermediate .to_string() calls
    // and the .to_string() on the format! result were redundant allocations.
    assert_eq!(
        seqs,
        HashSet::from_iter([
            seq.clone(),
            format!("{}{deleted}{}", &seq[..765], &seq[765..])
        ])
    );
}
#[test]
fn test_parses_deletion_and_insertion() {
    let context = setup_gen();
    let conn = context.graph().conn();
    let op_conn = context.operations().conn();
    track_database(conn, op_conn).unwrap();
    let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("fixtures/geneious_genbank/deletion_and_insertion.gb");
    let file = File::open(&path).unwrap();
    let _ = import_genbank(
        &context,
        BufReader::new(file),
        None,
        None,
        OperationInfo {
            files: vec![OperationFile {
                file_path: "".to_string(),
                file_type: FileTypes::GenBank,
            }],
            description: "test".to_string(),
        },
    );
    // `deleted` is the wild-type material that replaces the fixture's
    // 766..1557 region when reconstructing the unmodified sequence.
    let f = reader::parse_file(&path).unwrap();
    let seq = str::from_utf8(&f[0].seq).unwrap().to_string();
    let deleted: String = normalize_string(
        "TACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATTC
ATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCC
AGCGGCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGG
CGAAGAAGTTGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGG
ATTGGCTGAGACGAAAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTT
TCACCGTAACACGCCACATCTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGT
GGTATTCACTCCAGAGCGATGAAAACGTTTCAGTTTGCTCATGGAAAACGGTGTAACA
AGGGTGAACACTATCCCATATCACCAGCTCACCGTCTTTCATTGCCATACGGAATTCC
GGATGAGCATTCATCAGGCGGGCAAGAATGTGAATAAAGGCCGGATAAAACTTGTGCT
TATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCAGCTGAACGGTCTGGTTATA
GGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTACGATGCCATTGGGAT
ATATCAACGGTGGTATATCCAGTGATTTTTTTCTC",
    );
    let seqs = BlockGroup::get_all_sequences(
        conn,
        &BlockGroup::get_id("", None, "deletion_and_insertion"),
        false,
    );
    // Slices can be formatted directly — the intermediate .to_string() calls
    // and the .to_string() on the format! result were redundant allocations.
    assert_eq!(
        seqs,
        HashSet::from_iter([
            seq.clone(),
            format!("{}{deleted}{}", &seq[..766], &seq[1557..])
        ])
    );
}
#[test]
fn test_parses_substitution() {
    let context = setup_gen();
    let conn = context.graph().conn();
    let op_conn = context.operations().conn();
    track_database(conn, op_conn).unwrap();
    let path =
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/geneious_genbank/substitution.gb");
    let file = File::open(&path).unwrap();
    let _ = import_genbank(
        &context,
        BufReader::new(file),
        None,
        None,
        OperationInfo {
            files: vec![OperationFile {
                file_path: "".to_string(),
                file_type: FileTypes::GenBank,
            }],
            description: "test".to_string(),
        },
    );
    // `deleted` is the wild-type material substituted out of the fixture's
    // 766..1557 region; splicing it back reconstructs the original sequence.
    let f = reader::parse_file(&path).unwrap();
    let seq = str::from_utf8(&f[0].seq).unwrap().to_string();
    let deleted: String = normalize_string(
        "TACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATTC
ATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCC
AGCGGCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGG
CGAAGAAGTTGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGG
ATTGGCTGAGACGAAAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTT
TCACCGTAACACGCCACATCTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGT
GGTATTCACTCCAGAGCGATGAAAACGTTTCAGTTTGCTCATGGAAAACGGTGTAACA
AGGGTGAACACTATCCCATATCACCAGCTCACCGTCTTTCATTGCCATACGGAATTCC
GGATGAGCATTCATCAGGCGGGCAAGAATGTGAATAAAGGCCGGATAAAACTTGTGCT
TATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCAGCTGAACGGTCTGGTTATA
GGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTACGATGCCATTGGGAT
ATATCAACGGTGGTATATCCAGTGATTTTTTTCTC",
    );
    let seqs = BlockGroup::get_all_sequences(
        conn,
        &BlockGroup::get_id("", None, "substitution"),
        false,
    );
    // Slices can be formatted directly — the intermediate .to_string() calls
    // and the .to_string() on the format! result were redundant allocations.
    assert_eq!(
        seqs,
        HashSet::from_iter([
            seq.clone(),
            format!("{}{deleted}{}", &seq[..766], &seq[1557..])
        ])
    );
}
#[test]
fn test_parses_multiple_changes() {
    let context = setup_gen();
    let conn = context.graph().conn();
    let op_conn = context.operations().conn();
    track_database(conn, op_conn).unwrap();
    let gb_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("fixtures/geneious_genbank/multiple_insertions_deletions.gb");
    let file = File::open(&gb_path).unwrap();
    let info = OperationInfo {
        files: vec![OperationFile {
            file_path: "".to_string(),
            file_type: FileTypes::GenBank,
        }],
        description: "test".to_string(),
    };
    let _ = import_genbank(&context, BufReader::new(file), None, None, info);
    let parsed = reader::parse_file(&gb_path).unwrap();
    let mod_seq = str::from_utf8(&parsed[0].seq).unwrap().to_string();
    // NOTE(review): the lookup uses "insertion" even though the fixture is
    // multiple_insertions_deletions.gb — presumably the contig name inside
    // the fixture; verify against its LOCUS line.
    let bg_id = BlockGroup::get_id("", None, "insertion");
    let sequences: HashSet<String> = BlockGroup::get_all_sequences(conn, &bg_id, false)
        .iter()
        .map(|s| s.to_lowercase())
        .collect();
    let unchanged_seq = get_unmodified_sequence();
    // Both the fully modified sequence and the untouched reference must be
    // reachable in the block group.
    assert!(sequences.contains(&mod_seq));
    assert!(sequences.contains(&unchanged_seq));
}
}
}