gen/imports/
genbank.rs

1use crate::calculate_hash;
2use crate::genbank::{process_sequence, EditType, GenBankError};
3use crate::models::block_group::{BlockGroup, PathChange};
4use crate::models::block_group_edge::{BlockGroupEdge, BlockGroupEdgeData};
5use crate::models::collection::Collection;
6use crate::models::edge::Edge;
7use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID};
8use crate::models::operations::{Operation, OperationInfo};
9use crate::models::path::{Path, PathBlock};
10use crate::models::sample::Sample;
11use crate::models::sequence::Sequence;
12use crate::models::strand::Strand;
13use crate::operation_management::{end_operation, start_operation};
14use crate::progress_bar::{add_saving_operation_bar, get_handler, get_progress_bar};
15use gb_io::reader;
16use rusqlite::Connection;
17use std::io::Read;
18use std::str;
19
20pub fn import_genbank<'a, R>(
21    conn: &Connection,
22    op_conn: &Connection,
23    data: R,
24    collection: impl Into<Option<&'a str>>,
25    sample: impl Into<Option<&'a str>>,
26    operation_info: OperationInfo,
27) -> Result<Operation, GenBankError>
28where
29    R: Read,
30{
31    let progress_bar = get_handler();
32    let mut session = start_operation(conn);
33    let reader = reader::SeqReader::new(data);
34    let collection = Collection::create(conn, collection.into().unwrap_or_default());
35    let sample = sample.into();
36
37    if let Some(sample_name) = sample {
38        Sample::get_or_create(conn, sample_name);
39    }
40
41    let _ = progress_bar.println("Parsing GenBank");
42    let bar = progress_bar.add(get_progress_bar(None));
43    bar.set_message("Entries parsed");
44    for result in reader {
45        match result {
46            Ok(seq) => {
47                let locus = process_sequence(seq)?;
48                let original_seq = locus.original_sequence();
49                let mut seq_model = Sequence::new().sequence(&original_seq);
50                if !locus.name.is_empty() {
51                    seq_model = seq_model.name(&locus.name);
52                }
53                if let Some(ref mol_type) = locus.molecule_type {
54                    seq_model = seq_model.sequence_type(mol_type);
55                }
56                let sequence = seq_model.save(conn);
57                let wt_node_id = Node::create(
58                    conn,
59                    &sequence.hash,
60                    calculate_hash(&format!(
61                        "{collection}.{contig}:{hash}",
62                        collection = &collection.name,
63                        contig = &locus.name,
64                        hash = sequence.hash
65                    )),
66                );
67
68                let block_group = BlockGroup::create(conn, &collection.name, sample, &locus.name);
69                let edge_into = Edge::create(
70                    conn,
71                    PATH_START_NODE_ID,
72                    0,
73                    Strand::Forward,
74                    wt_node_id,
75                    0,
76                    Strand::Forward,
77                );
78                let edge_out_of = Edge::create(
79                    conn,
80                    wt_node_id,
81                    sequence.length,
82                    Strand::Forward,
83                    PATH_END_NODE_ID,
84                    0,
85                    Strand::Forward,
86                );
87                BlockGroupEdge::bulk_create(
88                    conn,
89                    &[
90                        BlockGroupEdgeData {
91                            block_group_id: block_group.id,
92                            edge_id: edge_into.id,
93                            chromosome_index: 0,
94                            phased: 0,
95                        },
96                        BlockGroupEdgeData {
97                            block_group_id: block_group.id,
98                            edge_id: edge_out_of.id,
99                            chromosome_index: 0,
100                            phased: 0,
101                        },
102                    ],
103                );
104                let path = Path::create(
105                    conn,
106                    &locus.name,
107                    block_group.id,
108                    &[edge_into.id, edge_out_of.id],
109                );
110
111                for edit in locus.changes_to_wt() {
112                    let start = edit.start;
113                    let end = edit.end;
114                    let change = match edit.edit_type {
115                        EditType::Insertion | EditType::Replacement => {
116                            let change_seq = Sequence::new()
117                                .sequence(&edit.new_sequence)
118                                .name(&format!(
119                                    "Geneious type: Editing History {edit_type}",
120                                    edit_type = edit.edit_type
121                                ))
122                                .sequence_type("DNA")
123                                .save(conn);
124                            let change_node = Node::create(
125                                conn,
126                                &change_seq.hash,
127                                calculate_hash(&format!(
128                                    "{parent_hash}:{start}-{end}->{new_hash}",
129                                    parent_hash = &sequence.hash,
130                                    new_hash = &change_seq.hash,
131                                )),
132                            );
133                            PathChange {
134                                block_group_id: block_group.id,
135                                path: path.clone(),
136                                path_accession: None,
137                                start,
138                                end,
139                                block: PathBlock {
140                                    id: 0,
141                                    node_id: change_node,
142                                    block_sequence: edit.new_sequence.clone(),
143                                    sequence_start: 0,
144                                    sequence_end: change_seq.length,
145                                    path_start: start,
146                                    path_end: end + change_seq.length,
147                                    strand: Strand::Forward,
148                                },
149                                chromosome_index: 0,
150                                phased: 0,
151                            }
152                        }
153                        EditType::Deletion => PathChange {
154                            block_group_id: block_group.id,
155                            path: path.clone(),
156                            path_accession: None,
157                            start,
158                            end,
159                            block: PathBlock {
160                                id: 0,
161                                node_id: wt_node_id,
162                                block_sequence: "".to_string(),
163                                sequence_start: 0,
164                                sequence_end: 0,
165                                path_start: start,
166                                path_end: end,
167                                strand: Strand::Forward,
168                            },
169                            chromosome_index: 0,
170                            phased: 0,
171                        },
172                    };
173                    let tree = path.intervaltree(conn);
174                    BlockGroup::insert_change(conn, &change, &tree);
175                }
176            }
177            Err(e) => return Err(GenBankError::ParseError(format!("Failed to parse {}", e))),
178        }
179        bar.inc(1);
180    }
181    bar.finish();
182    let filename = operation_info.file_path.clone();
183    let bar = add_saving_operation_bar(&progress_bar);
184    let op = end_operation(
185        conn,
186        op_conn,
187        &mut session,
188        operation_info,
189        &format!("Genbank Import of {filename}",),
190        None,
191    )
192    .map_err(GenBankError::OperationError);
193    bar.finish();
194    op
195}
196
197#[cfg(test)]
198mod tests {
199    use super::*;
200    use crate::models::file_types::FileTypes;
201    use crate::models::metadata;
202    use crate::models::operations::setup_db;
203    use crate::test_helpers::{get_connection, get_operation_connection, setup_gen_dir};
204    use noodles::fasta;
205    use std::collections::HashSet;
206    use std::fs::File;
207    use std::io::BufReader;
208    use std::path::PathBuf;
209
210    fn get_unmodified_sequence() -> String {
211        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
212            .join("fixtures/geneious_genbank/unmodified.fa");
213        let mut reader = fasta::io::reader::Builder.build_from_path(path).unwrap();
214        let mut records = reader.records();
215        let record = records.next().unwrap().unwrap();
216        let seq = record.sequence();
217        str::from_utf8(seq.as_ref()).unwrap().to_string()
218    }
219
220    #[test]
221    fn test_error_on_invalid_file() {
222        setup_gen_dir();
223        let conn = &get_connection(None);
224        let db_uuid = metadata::get_db_uuid(conn);
225        let op_conn = &get_operation_connection(None);
226        setup_db(op_conn, &db_uuid);
227        assert_eq!(
228            import_genbank(
229                conn,
230                op_conn,
231                BufReader::new("this is not valid".as_bytes()),
232                None,
233                None,
234                OperationInfo {
235                    file_path: "".to_string(),
236                    file_type: FileTypes::GenBank,
237                    description: "test".to_string(),
238                }
239            ),
240            Err(GenBankError::ParseError(
241                "Failed to parse Syntax error: Error Tag while parsing [this is not valid]"
242                    .to_string()
243            ))
244        )
245    }
246
247    #[test]
248    fn test_records_operation() {
249        setup_gen_dir();
250        let conn = &get_connection(None);
251        let db_uuid = metadata::get_db_uuid(conn);
252        let op_conn = &get_operation_connection(None);
253        setup_db(op_conn, &db_uuid);
254        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
255            .join("fixtures/geneious_genbank/insertion.gb");
256        let file = File::open(&path).unwrap();
257        let operation = import_genbank(
258            conn,
259            op_conn,
260            BufReader::new(file),
261            None,
262            None,
263            OperationInfo {
264                file_path: path.to_str().unwrap().to_string(),
265                file_type: FileTypes::GenBank,
266                description: "test".to_string(),
267            },
268        )
269        .unwrap();
270        assert_eq!(
271            Operation::get_by_hash(op_conn, &operation.hash).unwrap(),
272            operation
273        );
274    }
275
276    #[test]
277    fn test_creates_sample() {
278        setup_gen_dir();
279        let conn = &get_connection(None);
280        let db_uuid = metadata::get_db_uuid(conn);
281        let op_conn = &get_operation_connection(None);
282        setup_db(op_conn, &db_uuid);
283        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
284            .join("fixtures/geneious_genbank/insertion.gb");
285        let file = File::open(&path).unwrap();
286        let _ = import_genbank(
287            conn,
288            op_conn,
289            BufReader::new(file),
290            None,
291            "new-sample",
292            OperationInfo {
293                file_path: "".to_string(),
294                file_type: FileTypes::GenBank,
295                description: "test".to_string(),
296            },
297        );
298        assert_eq!(
299            Sample::get_by_name(conn, "new-sample").unwrap().name,
300            "new-sample"
301        );
302    }
303
304    #[cfg(test)]
305    mod geneious_genbanks {
306        use super::*;
307        use crate::normalize_string;
308
309        #[test]
310        fn test_parses_insertion() {
311            setup_gen_dir();
312            // this file has an insertion from 1426-2220
313            let conn = &get_connection(None);
314            let db_uuid = metadata::get_db_uuid(conn);
315            let op_conn = &get_operation_connection(None);
316            setup_db(op_conn, &db_uuid);
317            let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
318                .join("fixtures/geneious_genbank/insertion.gb");
319            let file = File::open(&path).unwrap();
320            let _ = import_genbank(
321                conn,
322                op_conn,
323                BufReader::new(file),
324                None,
325                None,
326                OperationInfo {
327                    file_path: "".to_string(),
328                    file_type: FileTypes::GenBank,
329                    description: "test".to_string(),
330                },
331            );
332            let f = reader::parse_file(&path).unwrap();
333            let seq = str::from_utf8(&f[0].seq).unwrap().to_string();
334            let seqs = BlockGroup::get_all_sequences(conn, 1, false);
335            assert_eq!(
336                seqs,
337                HashSet::from_iter([
338                    seq.clone(),
339                    format!("{}{}", &seq[..1425].to_string(), &seq[2220..].to_string()).to_string()
340                ])
341            );
342        }
343
344        #[test]
345        fn test_parses_deletion() {
346            setup_gen_dir();
347            // this file has a deletion from 765-766
348            let conn = &get_connection(None);
349            let db_uuid = metadata::get_db_uuid(conn);
350            let op_conn = &get_operation_connection(None);
351            setup_db(op_conn, &db_uuid);
352            let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
353                .join("fixtures/geneious_genbank/deletion.gb");
354            let file = File::open(&path).unwrap();
355            let _ = import_genbank(
356                conn,
357                op_conn,
358                BufReader::new(file),
359                None,
360                None,
361                OperationInfo {
362                    file_path: "".to_string(),
363                    file_type: FileTypes::GenBank,
364                    description: "test".to_string(),
365                },
366            );
367            let f = reader::parse_file(&path).unwrap();
368            let seq = str::from_utf8(&f[0].seq).unwrap().to_string();
369            let deleted: String = normalize_string(
370                "TTACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATT
371        CATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCCAGCG
372        GCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGGCGAAGAAGT
373        TGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGGATTGGCTGAGACGA
374        AAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTTTCACCGTAACACGCCACAT
375        CTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGTGGTATTCACTCCAGAGCGATGAAA
376        ACGTTTCAGTTTGCTCATGGAAAACGGTGTAACAAGGGTGAACACTATCCCATATCACCAGCT
377        CACCGTCTTTCATTGCCATACGGAATTCCGGATGAGCATTCATCAGGCGGGCAAGAATGTGAA
378        TAAAGGCCGGATAAAACTTGTGCTTATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCA
379        GCTGAACGGTCTGGTTATAGGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTAC
380        GATGCCATTGGGATATATCAACGGTGGTATATCCAGTGATTTTTTTCTCCAT",
381            );
382            let seqs = BlockGroup::get_all_sequences(conn, 1, false);
383            assert_eq!(
384                seqs,
385                HashSet::from_iter([
386                    seq.clone(),
387                    format!(
388                        "{}{deleted}{}",
389                        &seq[..765].to_string(),
390                        &seq[765..].to_string()
391                    )
392                    .to_string()
393                ])
394            );
395        }
396
397        #[test]
398        fn test_parses_deletion_and_insertion() {
399            setup_gen_dir();
400            let conn = &get_connection(None);
401            let db_uuid = metadata::get_db_uuid(conn);
402            let op_conn = &get_operation_connection(None);
403            setup_db(op_conn, &db_uuid);
404            let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
405                .join("fixtures/geneious_genbank/deletion_and_insertion.gb");
406            let file = File::open(&path).unwrap();
407            let _ = import_genbank(
408                conn,
409                op_conn,
410                BufReader::new(file),
411                None,
412                None,
413                OperationInfo {
414                    file_path: "".to_string(),
415                    file_type: FileTypes::GenBank,
416                    description: "test".to_string(),
417                },
418            );
419            let f = reader::parse_file(&path).unwrap();
420            let seq = str::from_utf8(&f[0].seq).unwrap().to_string();
421            let deleted: String = normalize_string(
422                "TACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATTC
423             ATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCC
424             AGCGGCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGG
425             CGAAGAAGTTGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGG
426             ATTGGCTGAGACGAAAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTT
427             TCACCGTAACACGCCACATCTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGT
428             GGTATTCACTCCAGAGCGATGAAAACGTTTCAGTTTGCTCATGGAAAACGGTGTAACA
429             AGGGTGAACACTATCCCATATCACCAGCTCACCGTCTTTCATTGCCATACGGAATTCC
430             GGATGAGCATTCATCAGGCGGGCAAGAATGTGAATAAAGGCCGGATAAAACTTGTGCT
431             TATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCAGCTGAACGGTCTGGTTATA
432             GGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTACGATGCCATTGGGAT
433             ATATCAACGGTGGTATATCCAGTGATTTTTTTCTC",
434            );
435            let seqs = BlockGroup::get_all_sequences(conn, 1, false);
436            assert_eq!(
437                seqs,
438                HashSet::from_iter([
439                    seq.clone(),
440                    format!(
441                        "{}{deleted}{}",
442                        &seq[..766].to_string(),
443                        &seq[1557..].to_string()
444                    )
445                    .to_string()
446                ])
447            );
448        }
449
450        #[test]
451        fn test_parses_substitution() {
452            setup_gen_dir();
453            // replacing a sequence ends up with the same result as doing a compound delete + insert
454            // in the above test.
455            let conn = &get_connection(None);
456            let db_uuid = metadata::get_db_uuid(conn);
457            let op_conn = &get_operation_connection(None);
458            setup_db(op_conn, &db_uuid);
459            let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
460                .join("fixtures/geneious_genbank/substitution.gb");
461            let file = File::open(&path).unwrap();
462            let _ = import_genbank(
463                conn,
464                op_conn,
465                BufReader::new(file),
466                None,
467                None,
468                OperationInfo {
469                    file_path: "".to_string(),
470                    file_type: FileTypes::GenBank,
471                    description: "test".to_string(),
472                },
473            );
474            let f = reader::parse_file(&path).unwrap();
475            let seq = str::from_utf8(&f[0].seq).unwrap().to_string();
476            let deleted: String = normalize_string(
477                "TACGCCCCGCCCTGCCACTCATCGCAGTACTGTTGTAATTC
478             ATTAAGCATTCTGCCGACATGGAAGCCATCACAAACGGCATGATGAACCTGAATCGCC
479             AGCGGCATCAGCACCTTGTCGCCTTGCGTATAATATTTGCCCATGGTGAAAACGGGGG
480             CGAAGAAGTTGTCCATATTGGCCACGTTTAAATCAAAACTGGTGAAACTCACCCAGGG
481             ATTGGCTGAGACGAAAAACATATTCTCAATAAACCCTTTAGGGAAATAGGCCAGGTTT
482             TCACCGTAACACGCCACATCTTGCGAATATATGTGTAGAAACTGCCGGAAATCGTCGT
483             GGTATTCACTCCAGAGCGATGAAAACGTTTCAGTTTGCTCATGGAAAACGGTGTAACA
484             AGGGTGAACACTATCCCATATCACCAGCTCACCGTCTTTCATTGCCATACGGAATTCC
485             GGATGAGCATTCATCAGGCGGGCAAGAATGTGAATAAAGGCCGGATAAAACTTGTGCT
486             TATTTTTCTTTACGGTCTTTAAAAAGGCCGTAATATCCAGCTGAACGGTCTGGTTATA
487             GGTACATTGAGCAACTGACTGAAATGCCTCAAAATGTTCTTTACGATGCCATTGGGAT
488             ATATCAACGGTGGTATATCCAGTGATTTTTTTCTC",
489            );
490            let seqs = BlockGroup::get_all_sequences(conn, 1, false);
491            assert_eq!(
492                seqs,
493                HashSet::from_iter([
494                    seq.clone(),
495                    format!(
496                        "{}{deleted}{}",
497                        &seq[..766].to_string(),
498                        &seq[1557..].to_string()
499                    )
500                    .to_string()
501                ])
502            );
503        }
504
505        #[test]
506        fn test_parses_multiple_changes() {
507            setup_gen_dir();
508            let conn = &get_connection(None);
509            let db_uuid = metadata::get_db_uuid(conn);
510            let op_conn = &get_operation_connection(None);
511            setup_db(op_conn, &db_uuid);
512            let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
513                .join("fixtures/geneious_genbank/multiple_insertions_deletions.gb");
514            let file = File::open(&path).unwrap();
515            let _ = import_genbank(
516                conn,
517                op_conn,
518                BufReader::new(file),
519                None,
520                None,
521                OperationInfo {
522                    file_path: "".to_string(),
523                    file_type: FileTypes::GenBank,
524                    description: "test".to_string(),
525                },
526            );
527            // there would be 4! sequences so we just check we have the fully changed and unchanged sequence
528            let f = reader::parse_file(&path).unwrap();
529            let mod_seq = str::from_utf8(&f[0].seq).unwrap().to_string();
530            let sequences: HashSet<String> = BlockGroup::get_all_sequences(conn, 1, false)
531                .iter()
532                .map(|s| s.to_lowercase())
533                .collect();
534            let unchanged_seq = get_unmodified_sequence();
535            assert!(sequences.contains(&mod_seq));
536            assert!(sequences.contains(&unchanged_seq));
537        }
538    }
539}