hgvs/data/
interface.rs

//! Definition of the interface for accessing the transcript database.
2
3use chrono::NaiveDateTime;
4use indexmap::IndexMap;
5
6use crate::{data::error::Error, sequences::TranslationTable};
7use biocommons_bioutils::assemblies::Assembly;
8
9/// Information about a gene.
10///
11/// ```text
12/// hgnc    | ATM
13/// maploc  | 11q22-q23
14/// descr   | ataxia telangiectasia mutated
15/// summary | The protein encoded by this gene belongs to the PI3/PI4-kinase family. This...
16/// aliases | AT1,ATA,ATC,ATD,ATE,ATDC,TEL1,TELO1
17/// added   | 2014-02-04 21:39:32.57125
18/// ```
19#[derive(Debug, PartialEq, Default, Clone)]
20pub struct GeneInfoRecord {
21    pub hgnc: String,
22    pub maploc: String,
23    pub descr: String,
24    pub summary: String,
25    pub aliases: Vec<String>,
26    pub added: NaiveDateTime,
27}
28
/// Information about similar transcripts.
///
/// Example record:
///
/// ```text
/// tx_ac1                 | NM_001285829.1
/// tx_ac2                 | ENST00000341255
/// hgnc_eq                | f
/// cds_eq                 | f
/// es_fp_eq               | f
/// cds_es_fp_eq           | f
/// cds_exon_lengths_fp_eq | t
/// ```
///
/// Hint: "es" = "exon set", "fp" = "fingerprint", "eq" = "equal"
///
/// "Exon structure" refers to the start and end coordinates on a
/// specified reference sequence. Thus, having the same exon
/// structure means that the transcripts are defined on the same
/// reference sequence and have the same exon spans on that
/// sequence.
#[derive(Debug, PartialEq, Eq, Default, Clone)]
pub struct TxSimilarityRecord {
    /// Accession of first transcript.
    pub tx_ac1: String,
    /// Accession of second transcript.
    pub tx_ac2: String,
    /// Whether the HGNC values are equal (going by the naming hint above;
    /// presumably the gene names of both transcripts).
    pub hgnc_eq: bool,
    /// Whether CDS sequences are identical.
    pub cds_eq: bool,
    /// Whether the full exon structures are identical (i.e., incl. UTR).
    pub es_fp_eq: bool,
    /// Whether the CDS-clipped portions of the exon structures are identical
    /// (i.e., excluding UTR).
    pub cds_es_fp_eq: bool,
    /// Whether the fingerprints of the CDS exon lengths are equal (going by the
    /// naming hint above).
    pub cds_exon_lengths_fp_eq: bool,
}
64
/// Exon-level record as returned by [`Provider::get_tx_exons`].
///
/// Example record:
///
/// ```text
/// hgnc            | TGDS
/// tx_ac           | NM_001304430.1
/// alt_ac          | NC_000013.10
/// alt_aln_method  | blat
/// alt_strand      | -1
/// ord             | 0
/// tx_start_i      | 0
/// tx_end_i        | 301
/// alt_start_i     | 95248228
/// alt_end_i       | 95248529
/// cigar           | 301=
/// tx_aseq         |
/// alt_aseq        |
/// tx_exon_set_id  | 348239
/// alt_exon_set_id | 722624
/// tx_exon_id      | 3518579
/// alt_exon_id     | 6063334
/// exon_aln_id     | 3461425
/// ```
#[derive(Debug, PartialEq, Eq, Default, Clone)]
pub struct TxExonsRecord {
    /// HGNC gene name (e.g., `TGDS`).
    pub hgnc: String,
    /// Transcript accession with version (e.g., `NM_001304430.1`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `NC_000013.10`).
    pub alt_ac: String,
    /// Sequence alignment method (e.g., `splign`, `blat`).
    pub alt_aln_method: String,
    /// Strand on `alt_ac` (the example shows `-1`).
    pub alt_strand: i16,
    /// Value of the `ord` column (presumably the exon's ordinal within the transcript).
    pub ord: i32,
    /// Start coordinate on the transcript (`tx_start_i` column).
    pub tx_start_i: i32,
    /// End coordinate on the transcript (`tx_end_i` column).
    pub tx_end_i: i32,
    /// Start coordinate on `alt_ac` (`alt_start_i` column).
    pub alt_start_i: i32,
    /// End coordinate on `alt_ac` (`alt_end_i` column).
    pub alt_end_i: i32,
    /// CIGAR string of the alignment (e.g., `301=`).
    pub cigar: String,
    /// Value of the `tx_aseq` column; `None` when empty as in the example.
    pub tx_aseq: Option<String>,
    /// Value of the `alt_aseq` column; `None` when empty as in the example.
    pub alt_aseq: Option<String>,
    /// Value of the `tx_exon_set_id` column.
    pub tx_exon_set_id: i32,
    /// Value of the `alt_exon_set_id` column.
    pub alt_exon_set_id: i32,
    /// Value of the `tx_exon_id` column.
    pub tx_exon_id: i32,
    /// Value of the `alt_exon_id` column.
    pub alt_exon_id: i32,
    /// Value of the `exon_aln_id` column.
    pub exon_aln_id: i32,
}
106
/// Record as returned by [`Provider::get_tx_for_region`].
///
/// Example record:
///
/// ```text
/// tx_ac          | NM_001304430.2
/// alt_ac         | NC_000013.10
/// alt_strand     | -1
/// alt_aln_method | splign
/// start_i        | 95226307
/// end_i          | 95248406
/// ```
#[derive(Debug, PartialEq, Eq, Default, Clone)]
pub struct TxForRegionRecord {
    /// Transcript accession with version (e.g., `NM_001304430.2`).
    pub tx_ac: String,
    /// Accession of the reference sequence (e.g., `NC_000013.10`).
    pub alt_ac: String,
    /// Strand on `alt_ac` (the example shows `-1`).
    pub alt_strand: i16,
    /// Alignment method (e.g., `splign`).
    pub alt_aln_method: String,
    /// Value of the `start_i` column (presumably the transcript's start on `alt_ac`).
    pub start_i: i32,
    /// Value of the `end_i` column (presumably the transcript's end on `alt_ac`).
    pub end_i: i32,
}
124
125/// ```text
126/// tx_ac          | NM_199425.2
127/// alt_ac         | NM_199425.2
128/// alt_aln_method | transcript
129/// cds_start_i    | 283
130/// cds_end_i      | 1003
131/// lengths        | {707,79,410}
132/// hgnc           | VSX1
133/// ```
134#[derive(Debug, PartialEq, Default, Clone)]
135pub struct TxIdentityInfo {
136    pub tx_ac: String,
137    pub alt_ac: String,
138    pub alt_aln_method: String,
139    pub cds_start_i: i32,
140    pub cds_end_i: i32,
141    pub lengths: Vec<i32>,
142    pub hgnc: String,
143    /// The translation table to use for this transcript.
144    pub translation_table: TranslationTable,
145}
146
/// Transcript info record as returned by [`Provider::get_tx_info`] and
/// [`Provider::get_tx_for_gene`].
///
/// Example record:
///
/// ```text
/// hgnc           | ATM
/// cds_start_i    | 385
/// cds_end_i      | 9556
/// tx_ac          | NM_000051.3
/// alt_ac         | AC_000143.1
/// alt_aln_method | splign
/// ```
#[derive(Debug, PartialEq, Eq, Default, Clone)]
pub struct TxInfoRecord {
    /// HGNC gene name (e.g., `ATM`).
    pub hgnc: String,
    /// Start of the CDS, if any (presumably `None` for transcripts without a CDS).
    pub cds_start_i: Option<i32>,
    /// End of the CDS, if any (presumably `None` for transcripts without a CDS).
    pub cds_end_i: Option<i32>,
    /// Transcript accession with version (e.g., `NM_000051.3`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `AC_000143.1`).
    pub alt_ac: String,
    /// Sequence alignment method (e.g., `splign`).
    pub alt_aln_method: String,
}
164
/// Mapping option record as returned by [`Provider::get_tx_mapping_options`].
///
/// Example records:
///
/// ```text
/// -[ RECORD 1 ]--+----------------
/// tx_ac          | ENST00000000233
/// alt_ac         | NC_000007.13
/// alt_aln_method | genebuild
/// -[ RECORD 2 ]--+----------------
/// tx_ac          | ENST00000000412
/// alt_ac         | NC_000012.11
/// alt_aln_method | genebuild
/// ```
#[derive(Debug, PartialEq, Eq, Default, Clone)]
pub struct TxMappingOptionsRecord {
    /// Transcript accession (e.g., `ENST00000000233`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `NC_000007.13`).
    pub alt_ac: String,
    /// Alignment method (e.g., `genebuild`).
    pub alt_aln_method: String,
}
181
182/// Interface for data providers.
183pub trait Provider {
184    /// Return the data version, e.g., `uta_20210129`.
185    fn data_version(&self) -> &str;
186
187    /// Return the schema version, e.g., `"1.1"`.
188    fn schema_version(&self) -> &str;
189
190    /// Return a map from accession to chromosome name for the given assembly
191    ///
192    /// For example, when `assembly_name = "GRCh38.p5"`, the value for `"NC_000001.11"`
193    /// would be `"1"`.
194    ///
195    /// # Arguments
196    ///
197    /// * `assembly` - The assembly to build the map for.
198    fn get_assembly_map(&self, assembly: Assembly) -> IndexMap<String, String>;
199
200    /// Returns the basic information about the gene.
201    ///
202    /// # Arguments
203    ///
204    /// * `hgnc` - HGNC gene name
205    fn get_gene_info(&self, hgnc: &str) -> Result<GeneInfoRecord, Error>;
206
207    /// Return the (single) associated protein accession for a given transcript accession,
208    /// or None if not found.
209    ///
210    /// # Arguments
211    ///
212    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
213    fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result<Option<String>, Error>;
214
215    /// Return full sequence for the given accession.
216    ///
217    /// # Arguments
218    ///
219    /// * `ac` -- accession
220    fn get_seq(&self, ac: &str) -> Result<String, Error> {
221        self.get_seq_part(ac, None, None)
222    }
223
224    /// Return sequence part for the given accession.
225    ///
226    /// # Arguments
227    ///
228    /// * `ac` -- accession
229    /// * `start` -- start position (0-based, start of sequence if missing)
230    /// * `end` -- end position (0-based, end of sequence if missing)
231    fn get_seq_part(
232        &self,
233        ac: &str,
234        begin: Option<usize>,
235        end: Option<usize>,
236    ) -> Result<String, Error>;
237
238    /// Returns a list of protein accessions for a given sequence.
239    ///
240    /// The list is guaranteed to contain at least one element with the MD5-based accession
241    /// (MD5_01234abc..def56789) at the end of the list.
242    fn get_acs_for_protein_seq(&self, seq: &str) -> Result<Vec<String>, Error>;
243
244    /// Return a list of transcripts that are similar to the given transcript, with relevant
245    /// similarity criteria.
246    ///
247    /// # Arguments
248    ///
249    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
250    fn get_similar_transcripts(&self, tx_ac: &str) -> Result<Vec<TxSimilarityRecord>, Error>;
251
252    /// Return transcript exon info for supplied accession (tx_ac, alt_ac, alt_aln_method),
253    /// or empty `Vec` if not found.
254    ///
255    /// # Arguments
256    ///
257    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
258    /// * `alt_ac` -- specific genomic sequence (e.g., NC_000011.4)
259    /// * `alt_aln_method` -- sequence alignment method (e.g., splign, blat)
260    fn get_tx_exons(
261        &self,
262        tx_ac: &str,
263        alt_ac: &str,
264        alt_aln_method: &str,
265    ) -> Result<Vec<TxExonsRecord>, Error>;
266
267    /// Return transcript info records for supplied gene, in order of decreasing length.
268    ///
269    /// # Arguments
270    ///
271    /// * `gene` - HGNC gene name
272    fn get_tx_for_gene(&self, gene: &str) -> Result<Vec<TxInfoRecord>, Error>;
273
274    /// Return transcripts that overlap given region.
275    ///
276    /// # Arguments
277    ///
278    // * `alt_ac` -- reference sequence (e.g., NC_000007.13)
279    // * `alt_aln_method` -- alignment method (e.g., splign)
280    // * `start_i` -- 5' bound of region
281    // * `end_i` -- 3' bound of region
282    fn get_tx_for_region(
283        &self,
284        alt_ac: &str,
285        alt_aln_method: &str,
286        start_i: i32,
287        end_i: i32,
288    ) -> Result<Vec<TxForRegionRecord>, Error>;
289
290    /// Return features associated with a single transcript.
291    ///
292    /// # Arguments
293    ///
294    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_199425.2')
295    fn get_tx_identity_info(&self, tx_ac: &str) -> Result<TxIdentityInfo, Error>;
296
297    /// Return a single transcript info for supplied accession (tx_ac, alt_ac, alt_aln_method), or None if not found.
298    ///
299    /// # Arguments
300    ///
301    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
302    /// * `alt_ac -- specific genomic sequence (e.g., NC_000011.4)
303    /// * `alt_aln_method` -- sequence alignment method (e.g., splign, blat)
304    fn get_tx_info(
305        &self,
306        tx_ac: &str,
307        alt_ac: &str,
308        alt_aln_method: &str,
309    ) -> Result<TxInfoRecord, Error>;
310
311    /// Return all transcript alignment sets for a given transcript accession (tx_ac).
312    ///
313    /// Returns empty list if transcript does not exist.  Use this method to discovery
314    /// possible mapping options supported in the database.
315    ///
316    /// # Arguments
317    ///
318    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
319    fn get_tx_mapping_options(&self, tx_ac: &str) -> Result<Vec<TxMappingOptionsRecord>, Error>;
320}
321
// <LICENSE>
// Copyright 2023 hgvs-rs Contributors
// Copyright 2014 Bioutils Contributors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// </LICENSE>