1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
//! Definition of the interface for accessing the transcript database.

use chrono::NaiveDateTime;
use indexmap::IndexMap;

use crate::data::error::Error;
use crate::static_data::Assembly;

/// Information about a gene.
///
/// This is the record type returned by [`Provider::get_gene_info`].
///
/// Example record (field name | value):
///
/// ```text
/// hgnc    | ATM
/// maploc  | 11q22-q23
/// descr   | ataxia telangiectasia mutated
/// summary | The protein encoded by this gene belongs to the PI3/PI4-kinase family. This...
/// aliases | AT1,ATA,ATC,ATD,ATE,ATDC,TEL1,TELO1
/// added   | 2014-02-04 21:39:32.57125
/// ```
#[derive(Debug, PartialEq, Default, Clone)]
pub struct GeneInfoRecord {
    /// HGNC gene name (e.g., `ATM`).
    pub hgnc: String,
    /// Map location of the gene (e.g., `11q22-q23`).
    pub maploc: String,
    /// Short description of the gene (e.g., `ataxia telangiectasia mutated`).
    pub descr: String,
    /// Free-text summary of the gene.
    pub summary: String,
    /// Alias names of the gene (e.g., `AT1`, `ATA`, ...), one entry per alias.
    pub aliases: Vec<String>,
    /// Timestamp from the `added` column; presumably the time the record was
    /// added to the database -- TODO confirm.
    pub added: NaiveDateTime,
}

/// Information about similar transcripts.
///
/// This is the record type returned by [`Provider::get_similar_transcripts`].
///
/// Example record (field name | value):
///
/// ```text
/// tx_ac1                 | NM_001285829.1
/// tx_ac2                 | ENST00000341255
/// hgnc_eq                | f
/// cds_eq                 | f
/// es_fp_eq               | f
/// cds_es_fp_eq           | f
/// cds_exon_lengths_fp_eq | t
/// ```
///
/// Hint: "es" = "exon set", "fp" = "fingerprint", "eq" = "equal"
///
/// "Exon structure" refers to the start and end coordinates on a
/// specified reference sequence. Thus, having the same exon
/// structure means that the transcripts are defined on the same
/// reference sequence and have the same exon spans on that
/// sequence.
#[derive(Debug, PartialEq, Default, Clone)]
pub struct TxSimilarityRecord {
    /// Accession of first transcript.
    pub tx_ac1: String,
    /// Accession of second transcript.
    pub tx_ac2: String,
    /// Whether the HGNC gene names of both transcripts are equal.
    pub hgnc_eq: bool,
    /// Whether CDS sequences are identical.
    pub cds_eq: bool,
    /// Whether the full exon structures are identical (i.e., including UTR).
    pub es_fp_eq: bool,
    /// Whether the CDS-clipped portions of the exon structures are identical
    /// (i.e., excluding UTR).
    pub cds_es_fp_eq: bool,
    /// Whether the fingerprints of the CDS exon lengths are equal.
    pub cds_exon_lengths_fp_eq: bool,
}

/// Transcript exon information.
///
/// This is the record type returned by [`Provider::get_tx_exons`].
///
/// Example record (field name | value):
///
/// ```text
/// hgnc            | TGDS
/// tx_ac           | NM_001304430.1
/// alt_ac          | NC_000013.10
/// alt_aln_method  | blat
/// alt_strand      | -1
/// ord             | 0
/// tx_start_i      | 0
/// tx_end_i        | 301
/// alt_start_i     | 95248228
/// alt_end_i       | 95248529
/// cigar           | 301=
/// tx_aseq         |
/// alt_aseq        |
/// tx_exon_set_id  | 348239
/// alt_exon_set_id | 722624
/// tx_exon_id      | 3518579
/// alt_exon_id     | 6063334
/// exon_aln_id     | 3461425
/// ```
///
/// NOTE(review): the `_i` suffix on coordinate fields presumably denotes
/// 0-based (interbase) coordinates -- confirm against the database schema.
#[derive(Debug, PartialEq, Default, Clone)]
pub struct TxExonsRecord {
    /// HGNC gene name (e.g., `TGDS`).
    pub hgnc: String,
    /// Transcript accession with version (e.g., `NM_001304430.1`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `NC_000013.10`).
    pub alt_ac: String,
    /// Sequence alignment method (e.g., `blat`, `splign`).
    pub alt_aln_method: String,
    /// Strand on `alt_ac` (`-1` in the example; presumably `1` for the forward strand).
    pub alt_strand: i16,
    /// Ordinal of the exon (`0` in the example); presumably the exon's index
    /// within the transcript -- TODO confirm.
    pub ord: i32,
    /// Start of the exon on the transcript sequence.
    pub tx_start_i: i32,
    /// End of the exon on the transcript sequence.
    pub tx_end_i: i32,
    /// Start of the exon on the `alt_ac` sequence.
    pub alt_start_i: i32,
    /// End of the exon on the `alt_ac` sequence.
    pub alt_end_i: i32,
    /// CIGAR string of the exon alignment (e.g., `301=`).
    pub cigar: String,
    /// Optional value of the `tx_aseq` column (empty in the example record).
    pub tx_aseq: Option<String>,
    /// Optional value of the `alt_aseq` column (empty in the example record).
    pub alt_aseq: Option<String>,
    /// Database identifier of the transcript exon set.
    pub tx_exon_set_id: i32,
    /// Database identifier of the `alt_ac` exon set.
    pub alt_exon_set_id: i32,
    /// Database identifier of the transcript exon.
    pub tx_exon_id: i32,
    /// Database identifier of the `alt_ac` exon.
    pub alt_exon_id: i32,
    /// Database identifier of the exon alignment.
    pub exon_aln_id: i32,
}

/// Transcript overlapping a region.
///
/// This is the record type returned by [`Provider::get_tx_for_region`].
///
/// Example record (field name | value):
///
/// ```text
/// tx_ac          | NM_001304430.2
/// alt_ac         | NC_000013.10
/// alt_strand     | -1
/// alt_aln_method | splign
/// start_i        | 95226307
/// end_i          | 95248406
/// ```
#[derive(Debug, PartialEq, Default, Clone)]
pub struct TxForRegionRecord {
    /// Transcript accession with version (e.g., `NM_001304430.2`).
    pub tx_ac: String,
    /// Accession of the reference sequence (e.g., `NC_000013.10`).
    pub alt_ac: String,
    /// Strand on `alt_ac` (`-1` in the example; presumably `1` for the forward strand).
    pub alt_strand: i16,
    /// Sequence alignment method (e.g., `splign`).
    pub alt_aln_method: String,
    /// Start position on `alt_ac`; presumably the start of the transcript's
    /// alignment -- TODO confirm.
    pub start_i: i32,
    /// End position on `alt_ac`; presumably the end of the transcript's
    /// alignment -- TODO confirm.
    pub end_i: i32,
}

/// Features associated with a single transcript.
///
/// This is the record type returned by [`Provider::get_tx_identity_info`].
///
/// Example record (field name | value):
///
/// ```text
/// tx_ac          | NM_199425.2
/// alt_ac         | NM_199425.2
/// alt_aln_method | transcript
/// cds_start_i    | 283
/// cds_end_i      | 1003
/// lengths        | {707,79,410}
/// hgnc           | VSX1
/// ```
#[derive(Debug, PartialEq, Default, Clone)]
pub struct TxIdentityInfo {
    /// Transcript accession with version (e.g., `NM_199425.2`).
    pub tx_ac: String,
    /// Accession of the aligned sequence; equal to `tx_ac` in the example record.
    pub alt_ac: String,
    /// Sequence alignment method (`transcript` in the example record).
    pub alt_aln_method: String,
    /// Start of the CDS on the transcript.
    pub cds_start_i: i32,
    /// End of the CDS on the transcript.
    pub cds_end_i: i32,
    /// List of lengths (e.g., `{707,79,410}`); presumably the exon lengths of
    /// the transcript -- TODO confirm.
    pub lengths: Vec<i32>,
    /// HGNC gene name (e.g., `VSX1`).
    pub hgnc: String,
}

/// Transcript information.
///
/// This is the record type returned by [`Provider::get_tx_info`] and
/// [`Provider::get_tx_for_gene`].
///
/// Example record (field name | value):
///
/// ```text
/// hgnc           | ATM
/// cds_start_i    | 385
/// cds_end_i      | 9556
/// tx_ac          | NM_000051.3
/// alt_ac         | AC_000143.1
/// alt_aln_method | splign
/// ```
#[derive(Debug, PartialEq, Default, Clone)]
pub struct TxInfoRecord {
    /// HGNC gene name (e.g., `ATM`).
    pub hgnc: String,
    /// Start of the CDS on the transcript, if any; presumably `None` for
    /// transcripts without a CDS -- TODO confirm.
    pub cds_start_i: Option<i32>,
    /// End of the CDS on the transcript, if any; presumably `None` for
    /// transcripts without a CDS -- TODO confirm.
    pub cds_end_i: Option<i32>,
    /// Transcript accession with version (e.g., `NM_000051.3`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `AC_000143.1`).
    pub alt_ac: String,
    /// Sequence alignment method (e.g., `splign`).
    pub alt_aln_method: String,
}

/// One mapping option (alignment set) of a transcript.
///
/// This is the record type returned by [`Provider::get_tx_mapping_options`].
///
/// Example records (field name | value):
///
/// ```text
/// -[ RECORD 1 ]--+----------------
/// tx_ac          | ENST00000000233
/// alt_ac         | NC_000007.13
/// alt_aln_method | genebuild
/// -[ RECORD 2 ]--+----------------
/// tx_ac          | ENST00000000412
/// alt_ac         | NC_000012.11
/// alt_aln_method | genebuild
/// ```
#[derive(Debug, PartialEq, Default, Clone)]
pub struct TxMappingOptionsRecord {
    /// Transcript accession (e.g., `ENST00000000233`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `NC_000007.13`).
    pub alt_ac: String,
    /// Sequence alignment method (e.g., `genebuild`).
    pub alt_aln_method: String,
}

/// Interface for data providers.
///
/// Implementors give access to the transcript database.  All fallible methods
/// return `Result<_, Error>`; which error is returned under which condition is
/// up to the implementation and is not fixed by this trait.
pub trait Provider {
    /// Return the data version, e.g., `uta_20210129`.
    fn data_version(&self) -> &str;

    /// Return the schema version, e.g., `"1.1"`.
    fn schema_version(&self) -> &str;

    /// Return a map from accession to chromosome name for the given assembly.
    ///
    /// For example, for the assembly `GRCh38.p5`, the value for `"NC_000001.11"`
    /// would be `"1"`.
    ///
    /// # Arguments
    ///
    /// * `assembly` - The assembly to build the map for.
    fn get_assembly_map(&self, assembly: Assembly) -> IndexMap<String, String>;

    /// Returns the basic information about the gene.
    ///
    /// # Arguments
    ///
    /// * `hgnc` - HGNC gene name
    fn get_gene_info(&self, hgnc: &str) -> Result<GeneInfoRecord, Error>;

    /// Return the (single) associated protein accession for a given transcript accession,
    /// or None if not found.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result<Option<String>, Error>;

    /// Return full sequence for the given accession.
    ///
    /// The default implementation delegates to [`Self::get_seq_part`] with
    /// both `begin` and `end` set to `None`.
    ///
    /// # Arguments
    ///
    /// * `ac` -- accession
    fn get_seq(&self, ac: &str) -> Result<String, Error> {
        self.get_seq_part(ac, None, None)
    }

    /// Return sequence part for the given accession.
    ///
    /// NOTE(review): whether `end` is inclusive or exclusive is not visible
    /// from this trait -- confirm against the implementations.
    ///
    /// # Arguments
    ///
    /// * `ac` -- accession
    /// * `begin` -- start position (0-based, start of sequence if missing)
    /// * `end` -- end position (0-based, end of sequence if missing)
    fn get_seq_part(
        &self,
        ac: &str,
        begin: Option<usize>,
        end: Option<usize>,
    ) -> Result<String, Error>;

    /// Returns a list of protein accessions for a given sequence.
    ///
    /// The list is guaranteed to contain at least one element with the MD5-based accession
    /// (MD5_01234abc..def56789) at the end of the list.
    ///
    /// # Arguments
    ///
    /// * `seq` -- protein sequence to look up accessions for
    fn get_acs_for_protein_seq(&self, seq: &str) -> Result<Vec<String>, Error>;

    /// Return a list of transcripts that are similar to the given transcript, with relevant
    /// similarity criteria.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    fn get_similar_transcripts(&self, tx_ac: &str) -> Result<Vec<TxSimilarityRecord>, Error>;

    /// Return transcript exon info for supplied accession (tx_ac, alt_ac, alt_aln_method),
    /// or empty `Vec` if not found.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    /// * `alt_ac` -- specific genomic sequence (e.g., NC_000011.4)
    /// * `alt_aln_method` -- sequence alignment method (e.g., splign, blat)
    fn get_tx_exons(
        &self,
        tx_ac: &str,
        alt_ac: &str,
        alt_aln_method: &str,
    ) -> Result<Vec<TxExonsRecord>, Error>;

    /// Return transcript info records for supplied gene, in order of decreasing length.
    ///
    /// # Arguments
    ///
    /// * `gene` - HGNC gene name
    fn get_tx_for_gene(&self, gene: &str) -> Result<Vec<TxInfoRecord>, Error>;

    /// Return transcripts that overlap given region.
    ///
    /// # Arguments
    ///
    /// * `alt_ac` -- reference sequence (e.g., NC_000007.13)
    /// * `alt_aln_method` -- alignment method (e.g., splign)
    /// * `start_i` -- 5' bound of region
    /// * `end_i` -- 3' bound of region
    fn get_tx_for_region(
        &self,
        alt_ac: &str,
        alt_aln_method: &str,
        start_i: i32,
        end_i: i32,
    ) -> Result<Vec<TxForRegionRecord>, Error>;

    /// Return features associated with a single transcript.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_199425.2')
    fn get_tx_identity_info(&self, tx_ac: &str) -> Result<TxIdentityInfo, Error>;

    /// Return a single transcript info for supplied accession (tx_ac, alt_ac, alt_aln_method).
    ///
    /// NOTE(review): the return type carries no `Option`, so a missing transcript
    /// is presumably reported through `Err` -- confirm against the implementations.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    /// * `alt_ac` -- specific genomic sequence (e.g., NC_000011.4)
    /// * `alt_aln_method` -- sequence alignment method (e.g., splign, blat)
    fn get_tx_info(
        &self,
        tx_ac: &str,
        alt_ac: &str,
        alt_aln_method: &str,
    ) -> Result<TxInfoRecord, Error>;

    /// Return all transcript alignment sets for a given transcript accession (tx_ac).
    ///
    /// Returns empty list if transcript does not exist.  Use this method to discover
    /// possible mapping options supported in the database.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    fn get_tx_mapping_options(&self, tx_ac: &str) -> Result<Vec<TxMappingOptionsRecord>, Error>;
}

// <LICENSE>
// Copyright 2023 hgvs-rs Contributors
// Copyright 2014 Bioutils Contributors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </LICENSE>