1use crate::calculate_hash;
2use crate::genbank::{process_sequence, EditType, GenBankError};
3use crate::models::block_group::{BlockGroup, PathChange};
4use crate::models::block_group_edge::{BlockGroupEdge, BlockGroupEdgeData};
5use crate::models::collection::Collection;
6use crate::models::edge::Edge;
7use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID};
8use crate::models::operations::{Operation, OperationInfo};
9use crate::models::path::{Path, PathBlock};
10use crate::models::sample::Sample;
11use crate::models::sequence::Sequence;
12use crate::models::strand::Strand;
13use crate::operation_management::{end_operation, start_operation};
14use crate::progress_bar::{add_saving_operation_bar, get_handler, get_progress_bar};
15use gb_io::reader;
16use rusqlite::Connection;
17use std::io::Read;
18use std::str;
19
20pub fn import_genbank<'a, R>(
21 conn: &Connection,
22 op_conn: &Connection,
23 data: R,
24 collection: impl Into<Option<&'a str>>,
25 sample: impl Into<Option<&'a str>>,
26 operation_info: OperationInfo,
27) -> Result<Operation, GenBankError>
28where
29 R: Read,
30{
31 let progress_bar = get_handler();
32 let mut session = start_operation(conn);
33 let reader = reader::SeqReader::new(data);
34 let collection = Collection::create(conn, collection.into().unwrap_or_default());
35 let sample = sample.into();
36
37 if let Some(sample_name) = sample {
38 Sample::get_or_create(conn, sample_name);
39 }
40
41 let _ = progress_bar.println("Parsing GenBank");
42 let bar = progress_bar.add(get_progress_bar(None));
43 bar.set_message("Entries parsed");
44 for result in reader {
45 match result {
46 Ok(seq) => {
47 let locus = process_sequence(seq)?;
48 let original_seq = locus.original_sequence();
49 let mut seq_model = Sequence::new().sequence(&original_seq);
50 if !locus.name.is_empty() {
51 seq_model = seq_model.name(&locus.name);
52 }
53 if let Some(ref mol_type) = locus.molecule_type {
54 seq_model = seq_model.sequence_type(mol_type);
55 }
56 let sequence = seq_model.save(conn);
57 let wt_node_id = Node::create(
58 conn,
59 &sequence.hash,
60 calculate_hash(&format!(
61 "{collection}.{contig}:{hash}",
62 collection = &collection.name,
63 contig = &locus.name,
64 hash = sequence.hash
65 )),
66 );
67
68 let block_group = BlockGroup::create(conn, &collection.name, sample, &locus.name);
69 let edge_into = Edge::create(
70 conn,
71 PATH_START_NODE_ID,
72 0,
73 Strand::Forward,
74 wt_node_id,
75 0,
76 Strand::Forward,
77 );
78 let edge_out_of = Edge::create(
79 conn,
80 wt_node_id,
81 sequence.length,
82 Strand::Forward,
83 PATH_END_NODE_ID,
84 0,
85 Strand::Forward,
86 );
87 BlockGroupEdge::bulk_create(
88 conn,
89 &[
90 BlockGroupEdgeData {
91 block_group_id: block_group.id,
92 edge_id: edge_into.id,
93 chromosome_index: 0,
94 phased: 0,
95 },
96 BlockGroupEdgeData {
97 block_group_id: block_group.id,
98 edge_id: edge_out_of.id,
99 chromosome_index: 0,
100 phased: 0,
101 },
102 ],
103 );
104 let path = Path::create(
105 conn,
106 &locus.name,
107 block_group.id,
108 &[edge_into.id, edge_out_of.id],
109 );
110
111 for edit in locus.changes_to_wt() {
112 let start = edit.start;
113 let end = edit.end;
114 let change = match edit.edit_type {
115 EditType::Insertion | EditType::Replacement => {
116 let change_seq = Sequence::new()
117 .sequence(&edit.new_sequence)
118 .name(&format!(
119 "Geneious type: Editing History {edit_type}",
120 edit_type = edit.edit_type
121 ))
122 .sequence_type("DNA")
123 .save(conn);
124 let change_node = Node::create(
125 conn,
126 &change_seq.hash,
127 calculate_hash(&format!(
128 "{parent_hash}:{start}-{end}->{new_hash}",
129 parent_hash = &sequence.hash,
130 new_hash = &change_seq.hash,
131 )),
132 );
133 PathChange {
134 block_group_id: block_group.id,
135 path: path.clone(),
136 path_accession: None,
137 start,
138 end,
139 block: PathBlock {
140 id: 0,
141 node_id: change_node,
142 block_sequence: edit.new_sequence.clone(),
143 sequence_start: 0,
144 sequence_end: change_seq.length,
145 path_start: start,
146 path_end: end + change_seq.length,
147 strand: Strand::Forward,
148 },
149 chromosome_index: 0,
150 phased: 0,
151 }
152 }
153 EditType::Deletion => PathChange {
154 block_group_id: block_group.id,
155 path: path.clone(),
156 path_accession: None,
157 start,
158 end,
159 block: PathBlock {
160 id: 0,
161 node_id: wt_node_id,
162 block_sequence: "".to_string(),
163 sequence_start: 0,
164 sequence_end: 0,
165 path_start: start,
166 path_end: end,
167 strand: Strand::Forward,
168 },
169 chromosome_index: 0,
170 phased: 0,
171 },
172 };
173 let tree = path.intervaltree(conn);
174 BlockGroup::insert_change(conn, &change, &tree);
175 }
176 }
177 Err(e) => return Err(GenBankError::ParseError(format!("Failed to parse {}", e))),
178 }
179 bar.inc(1);
180 }
181 bar.finish();
182 let filename = operation_info.file_path.clone();
183 let bar = add_saving_operation_bar(&progress_bar);
184 let op = end_operation(
185 conn,
186 op_conn,
187 &mut session,
188 operation_info,
189 &format!("Genbank Import of {filename}",),
190 None,
191 )
192 .map_err(GenBankError::OperationError);
193 bar.finish();
194 op
195}
196
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::file_types::FileTypes;
    use crate::models::metadata;
    use crate::models::operations::setup_db;
    use crate::test_helpers::{get_connection, get_operation_connection, setup_gen_dir};
    use noodles::fasta;
    use std::collections::HashSet;
    use std::fs::File;
    use std::io::BufReader;
    use std::path::PathBuf;

    /// Absolute path of a file inside the Geneious GenBank fixture directory.
    fn fixture_path(file_name: &str) -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join(format!("fixtures/geneious_genbank/{file_name}"))
    }

    /// Prepares the gen directory and returns `(conn, op_conn)` with the operations
    /// database set up for the data database's uuid.
    fn setup_connections() -> (Connection, Connection) {
        setup_gen_dir();
        let conn = get_connection(None);
        let db_uuid = metadata::get_db_uuid(&conn);
        let op_conn = get_operation_connection(None);
        setup_db(&op_conn, &db_uuid);
        (conn, op_conn)
    }

    /// Operation metadata shared by the tests; only the file path varies.
    fn operation_info(file_path: &str) -> OperationInfo {
        OperationInfo {
            file_path: file_path.to_string(),
            file_type: FileTypes::GenBank,
            description: "test".to_string(),
        }
    }

    /// Imports a fixture with no collection or sample and returns the fixture's path.
    ///
    /// The import result is deliberately ignored; callers assert on database state.
    fn import_fixture(conn: &Connection, op_conn: &Connection, file_name: &str) -> PathBuf {
        let path = fixture_path(file_name);
        let file = File::open(&path).unwrap();
        let _ = import_genbank(
            conn,
            op_conn,
            BufReader::new(file),
            None,
            None,
            operation_info(""),
        );
        path
    }

    /// Sequence of the first record in a GenBank file, as a `String`.
    fn first_record_sequence(path: &std::path::Path) -> String {
        let records = reader::parse_file(path).unwrap();
        str::from_utf8(&records[0].seq).unwrap().to_string()
    }

    /// Sequence of the first record in the `unmodified.fa` fixture.
    fn get_unmodified_sequence() -> String {
        let path = fixture_path("unmodified.fa");
        let mut reader = fasta::io::reader::Builder.build_from_path(path).unwrap();
        let mut records = reader.records();
        let record = records.next().unwrap().unwrap();
        let seq = record.sequence();
        str::from_utf8(seq.as_ref()).unwrap().to_string()
    }

    #[test]
    fn test_error_on_invalid_file() {
        let (conn, op_conn) = setup_connections();
        let result = import_genbank(
            &conn,
            &op_conn,
            BufReader::new("this is not valid".as_bytes()),
            None,
            None,
            operation_info(""),
        );
        assert_eq!(
            result,
            Err(GenBankError::ParseError(
                "Failed to parse Syntax error: Error Tag while parsing [this is not valid]"
                    .to_string()
            ))
        )
    }

    #[test]
    fn test_records_operation() {
        let (conn, op_conn) = setup_connections();
        let path = fixture_path("insertion.gb");
        let file = File::open(&path).unwrap();
        let operation = import_genbank(
            &conn,
            &op_conn,
            BufReader::new(file),
            None,
            None,
            operation_info(path.to_str().unwrap()),
        )
        .unwrap();
        // The returned operation must be retrievable by its hash.
        assert_eq!(
            Operation::get_by_hash(&op_conn, &operation.hash).unwrap(),
            operation
        );
    }

    #[test]
    fn test_creates_sample() {
        let (conn, op_conn) = setup_connections();
        let path = fixture_path("insertion.gb");
        let file = File::open(&path).unwrap();
        let _ = import_genbank(
            &conn,
            &op_conn,
            BufReader::new(file),
            None,
            "new-sample",
            operation_info(""),
        );
        assert_eq!(
            Sample::get_by_name(&conn, "new-sample").unwrap().name,
            "new-sample"
        );
    }

    #[cfg(test)]
    mod geneious_genbanks {
        use super::*;
        use crate::normalize_string;

        #[test]
        fn test_parses_insertion() {
            let (conn, op_conn) = setup_connections();
            let path = import_fixture(&conn, &op_conn, "insertion.gb");
            let seq = first_record_sequence(&path);
            // The second expected sequence is the record with bases 1425..2220 removed.
            let without_insertion = format!("{}{}", &seq[..1425], &seq[2220..]);
            let seqs = BlockGroup::get_all_sequences(&conn, 1, false);
            assert_eq!(seqs, HashSet::from_iter([seq, without_insertion]));
        }

        #[test]
        fn test_parses_deletion() {
            let (conn, op_conn) = setup_connections();
            let path = import_fixture(&conn, &op_conn, "deletion.gb");
            let seq = first_record_sequence(&path);
            // presumably `normalize_string` strips the line breaks and indentation
            // of this literal — confirm against its definition.
            let deleted: String = normalize_string(
                "TTACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATT
            CATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCCAGCG
            GCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGGCGAAGAAGT
            TGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGGATTGGCTGAGACGA
            AAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTTTCACCGTAACACGCCACAT
            CTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGTGGTATTCACTCCAGAGCGATGAAA
            ACGTTTCAGTTTGCTCATGGAAAACGGTGTAACAAGGGTGAACACTATCCCATATCACCAGCT
            CACCGTCTTTCATTGCCATACGGAATTCCGGATGAGCATTCATCAGGCGGGCAAGAATGTGAA
            TAAAGGCCGGATAAAACTTGTGCTTATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCA
            GCTGAACGGTCTGGTTATAGGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTAC
            GATGCCATTGGGATATATCAACGGTGGTATATCCAGTGATTTTTTTCTCCAT",
            );
            // The second expected sequence is the record with `deleted` put back at 765.
            let with_deleted = format!("{}{deleted}{}", &seq[..765], &seq[765..]);
            let seqs = BlockGroup::get_all_sequences(&conn, 1, false);
            assert_eq!(seqs, HashSet::from_iter([seq, with_deleted]));
        }

        #[test]
        fn test_parses_deletion_and_insertion() {
            let (conn, op_conn) = setup_connections();
            let path = import_fixture(&conn, &op_conn, "deletion_and_insertion.gb");
            let seq = first_record_sequence(&path);
            // presumably `normalize_string` strips the line breaks and indentation
            // of this literal — confirm against its definition.
            let deleted: String = normalize_string(
                "TACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATTC
            ATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCC
            AGCGGCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGG
            CGAAGAAGTTGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGG
            ATTGGCTGAGACGAAAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTT
            TCACCGTAACACGCCACATCTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGT
            GGTATTCACTCCAGAGCGATGAAAACGTTTCAGTTTGCTCATGGAAAACGGTGTAACA
            AGGGTGAACACTATCCCATATCACCAGCTCACCGTCTTTCATTGCCATACGGAATTCC
            GGATGAGCATTCATCAGGCGGGCAAGAATGTGAATAAAGGCCGGATAAAACTTGTGCT
            TATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCAGCTGAACGGTCTGGTTATA
            GGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTACGATGCCATTGGGAT
            ATATCAACGGTGGTATATCCAGTGATTTTTTTCTC",
            );
            // The second expected sequence swaps bases 766..1557 of the record for `deleted`.
            let with_deleted = format!("{}{deleted}{}", &seq[..766], &seq[1557..]);
            let seqs = BlockGroup::get_all_sequences(&conn, 1, false);
            assert_eq!(seqs, HashSet::from_iter([seq, with_deleted]));
        }

        #[test]
        fn test_parses_substitution() {
            let (conn, op_conn) = setup_connections();
            let path = import_fixture(&conn, &op_conn, "substitution.gb");
            let seq = first_record_sequence(&path);
            // presumably `normalize_string` strips the line breaks and indentation
            // of this literal — confirm against its definition.
            let deleted: String = normalize_string(
                "TACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATTC
            ATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCC
            AGCGGCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGG
            CGAAGAAGTTGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGG
            ATTGGCTGAGACGAAAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTT
            TCACCGTAACACGCCACATCTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGT
            GGTATTCACTCCAGAGCGATGAAAACGTTTCAGTTTGCTCATGGAAAACGGTGTAACA
            AGGGTGAACACTATCCCATATCACCAGCTCACCGTCTTTCATTGCCATACGGAATTCC
            GGATGAGCATTCATCAGGCGGGCAAGAATGTGAATAAAGGCCGGATAAAACTTGTGCT
            TATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCAGCTGAACGGTCTGGTTATA
            GGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTACGATGCCATTGGGAT
            ATATCAACGGTGGTATATCCAGTGATTTTTTTCTC",
            );
            // The second expected sequence swaps bases 766..1557 of the record for `deleted`.
            let with_deleted = format!("{}{deleted}{}", &seq[..766], &seq[1557..]);
            let seqs = BlockGroup::get_all_sequences(&conn, 1, false);
            assert_eq!(seqs, HashSet::from_iter([seq, with_deleted]));
        }

        #[test]
        fn test_parses_multiple_changes() {
            let (conn, op_conn) = setup_connections();
            let path = import_fixture(&conn, &op_conn, "multiple_insertions_deletions.gb");
            let mod_seq = first_record_sequence(&path);
            // Sequences are lowercased before comparing with the fixture sequences.
            let sequences: HashSet<String> = BlockGroup::get_all_sequences(&conn, 1, false)
                .iter()
                .map(|s| s.to_lowercase())
                .collect();
            let unchanged_seq = get_unmodified_sequence();
            assert!(sequences.contains(&mod_seq));
            assert!(sequences.contains(&unchanged_seq));
        }
    }
}