hgvs/data/interface.rs
1//! Definition of the interface for accessing the transcript database.
2
3use chrono::NaiveDateTime;
4use indexmap::IndexMap;
5
6use crate::{data::error::Error, sequences::TranslationTable};
7use biocommons_bioutils::assemblies::Assembly;
8
/// Information about a gene.
///
/// Returned by [`Provider::get_gene_info`]. Example record, shown as
/// `field | value` pairs:
///
/// ```text
/// hgnc | ATM
/// maploc | 11q22-q23
/// descr | ataxia telangiectasia mutated
/// summary | The protein encoded by this gene belongs to the PI3/PI4-kinase family. This...
/// aliases | AT1,ATA,ATC,ATD,ATE,ATDC,TEL1,TELO1
/// added | 2014-02-04 21:39:32.57125
/// ```
#[derive(Debug, PartialEq, Default, Clone)]
pub struct GeneInfoRecord {
    /// HGNC gene name (e.g., `ATM`).
    pub hgnc: String,
    /// Map location of the gene (e.g., `11q22-q23`).
    // NOTE(review): presumably the cytogenetic band -- confirm against the data source.
    pub maploc: String,
    /// Short description of the gene (e.g., `ataxia telangiectasia mutated`).
    pub descr: String,
    /// Longer free-text summary of the gene.
    pub summary: String,
    /// Alias names of the gene (e.g., `AT1`, `ATA`, `ATC`).
    pub aliases: Vec<String>,
    /// Timestamp without time zone (e.g., `2014-02-04 21:39:32.57125`).
    // NOTE(review): presumably the time the record was added to the database -- confirm.
    pub added: NaiveDateTime,
}
28
/// Information about similar transcripts.
///
/// Returned by [`Provider::get_similar_transcripts`]. Example record:
///
/// ```text
/// tx_ac1 | NM_001285829.1
/// tx_ac2 | ENST00000341255
/// hgnc_eq | f
/// cds_eq | f
/// es_fp_eq | f
/// cds_es_fp_eq | f
/// cds_exon_lengths_fp_eq | t
/// ```
///
/// Hint: "es" = "exon set", "fp" = "fingerprint", "eq" = "equal"
///
/// "Exon structure" refers to the start and end coordinates on a
/// specified reference sequence. Thus, having the same exon
/// structure means that the transcripts are defined on the same
/// reference sequence and have the same exon spans on that
/// sequence.
///
/// All fields are plain strings/booleans, so the record is `Eq` and `Hash`
/// and can be de-duplicated through hash-based collections.
#[derive(Debug, PartialEq, Eq, Hash, Default, Clone)]
pub struct TxSimilarityRecord {
    /// Accession of first transcript.
    pub tx_ac1: String,
    /// Accession of second transcript.
    pub tx_ac2: String,
    /// Whether the HGNC gene names of both transcripts are equal.
    pub hgnc_eq: bool,
    /// Whether CDS sequences are identical.
    pub cds_eq: bool,
    /// Whether the full exon structures are identical (i.e., incl. UTR).
    pub es_fp_eq: bool,
    /// Whether the cds-clipped portions of the exon structures are identical
    /// (i.e., excluding UTR).
    pub cds_es_fp_eq: bool,
    /// Whether the fingerprints of the CDS exon lengths are equal.
    pub cds_exon_lengths_fp_eq: bool,
}
64
/// Alignment information for one exon of a transcript.
///
/// Returned (one record per exon) by [`Provider::get_tx_exons`]. Example record:
///
/// ```text
/// hgnc | TGDS
/// tx_ac | NM_001304430.1
/// alt_ac | NC_000013.10
/// alt_aln_method | blat
/// alt_strand | -1
/// ord | 0
/// tx_start_i | 0
/// tx_end_i | 301
/// alt_start_i | 95248228
/// alt_end_i | 95248529
/// cigar | 301=
/// tx_aseq |
/// alt_aseq |
/// tx_exon_set_id | 348239
/// alt_exon_set_id | 722624
/// tx_exon_id | 3518579
/// alt_exon_id | 6063334
/// exon_aln_id | 3461425
/// ```
///
/// All fields are strings, integers, or optional strings, so the record is
/// `Eq` and `Hash` and can be used in hash-based collections.
#[derive(Debug, PartialEq, Eq, Hash, Default, Clone)]
pub struct TxExonsRecord {
    /// HGNC gene name (e.g., `TGDS`).
    pub hgnc: String,
    /// Transcript accession (e.g., `NM_001304430.1`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `NC_000013.10`).
    pub alt_ac: String,
    /// Sequence alignment method (e.g., `blat`, `splign`).
    pub alt_aln_method: String,
    /// Strand on `alt_ac` (`-1` in the example above).
    pub alt_strand: i16,
    /// Ordinal of this record (`0` in the example above).
    // NOTE(review): presumably the exon's index within the transcript -- confirm.
    pub ord: i32,
    /// Start of the exon on the transcript.
    // NOTE(review): the `_i` coordinates look 0-based/interbase -- confirm.
    pub tx_start_i: i32,
    /// End of the exon on the transcript.
    pub tx_end_i: i32,
    /// Start of the exon on `alt_ac`.
    pub alt_start_i: i32,
    /// End of the exon on `alt_ac`.
    pub alt_end_i: i32,
    /// CIGAR string of the exon alignment (e.g., `301=`).
    pub cigar: String,
    /// Optional transcript-side sequence string; empty in the example above.
    // NOTE(review): presumably the aligned sequence -- confirm.
    pub tx_aseq: Option<String>,
    /// Optional `alt_ac`-side sequence string; empty in the example above.
    pub alt_aseq: Option<String>,
    /// Identifier of the transcript exon set.
    pub tx_exon_set_id: i32,
    /// Identifier of the `alt_ac` exon set.
    pub alt_exon_set_id: i32,
    /// Identifier of the transcript exon.
    pub tx_exon_id: i32,
    /// Identifier of the `alt_ac` exon.
    pub alt_exon_id: i32,
    /// Identifier of the exon alignment.
    pub exon_aln_id: i32,
}
106
/// A transcript alignment overlapping a queried region.
///
/// Returned by [`Provider::get_tx_for_region`]. Example record:
///
/// ```text
/// tx_ac | NM_001304430.2
/// alt_ac | NC_000013.10
/// alt_strand | -1
/// alt_aln_method | splign
/// start_i | 95226307
/// end_i | 95248406
/// ```
///
/// All fields are strings or integers, so the record is `Eq` and `Hash`
/// and can be used in hash-based collections.
#[derive(Debug, PartialEq, Eq, Hash, Default, Clone)]
pub struct TxForRegionRecord {
    /// Transcript accession (e.g., `NM_001304430.2`).
    pub tx_ac: String,
    /// Accession of the reference sequence (e.g., `NC_000013.10`).
    pub alt_ac: String,
    /// Strand on `alt_ac` (`-1` in the example above).
    pub alt_strand: i16,
    /// Sequence alignment method (e.g., `splign`).
    pub alt_aln_method: String,
    /// Start coordinate on `alt_ac`.
    // NOTE(review): presumably the start of the transcript's alignment, 0-based -- confirm.
    pub start_i: i32,
    /// End coordinate on `alt_ac`.
    pub end_i: i32,
}
124
/// Features associated with a single transcript.
///
/// Returned by [`Provider::get_tx_identity_info`]. Example record:
///
/// ```text
/// tx_ac | NM_199425.2
/// alt_ac | NM_199425.2
/// alt_aln_method | transcript
/// cds_start_i | 283
/// cds_end_i | 1003
/// lengths | {707,79,410}
/// hgnc | VSX1
/// ```
#[derive(Debug, PartialEq, Default, Clone)]
pub struct TxIdentityInfo {
    /// Transcript accession (e.g., `NM_199425.2`).
    pub tx_ac: String,
    /// Accession of the aligned sequence; equal to `tx_ac` in the example above.
    pub alt_ac: String,
    /// Sequence alignment method (`transcript` in the example above).
    pub alt_aln_method: String,
    /// Start of the CDS on the transcript.
    // NOTE(review): the `_i` coordinates look 0-based/interbase -- confirm.
    pub cds_start_i: i32,
    /// End of the CDS on the transcript.
    pub cds_end_i: i32,
    /// List of lengths (e.g., `{707,79,410}`).
    // NOTE(review): presumably the exon lengths in transcript order -- confirm.
    pub lengths: Vec<i32>,
    /// HGNC gene name (e.g., `VSX1`).
    pub hgnc: String,
    /// The translation table to use for this transcript.
    pub translation_table: TranslationTable,
}
146
/// Basic information about a transcript and one of its alignments.
///
/// Returned by [`Provider::get_tx_info`] and [`Provider::get_tx_for_gene`].
/// Example record:
///
/// ```text
/// hgnc | ATM
/// cds_start_i | 385
/// cds_end_i | 9556
/// tx_ac | NM_000051.3
/// alt_ac | AC_000143.1
/// alt_aln_method | splign
/// ```
///
/// All fields are strings or optional integers, so the record is `Eq` and
/// `Hash` and can be used in hash-based collections.
#[derive(Debug, PartialEq, Eq, Hash, Default, Clone)]
pub struct TxInfoRecord {
    /// HGNC gene name (e.g., `ATM`).
    pub hgnc: String,
    /// Start of the CDS on the transcript, if available.
    // NOTE(review): presumably `None` for transcripts without a CDS -- confirm.
    pub cds_start_i: Option<i32>,
    /// End of the CDS on the transcript, if available.
    pub cds_end_i: Option<i32>,
    /// Transcript accession (e.g., `NM_000051.3`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `AC_000143.1`).
    pub alt_ac: String,
    /// Sequence alignment method (e.g., `splign`).
    pub alt_aln_method: String,
}
164
/// One alignment ("mapping option") available for a transcript.
///
/// Returned by [`Provider::get_tx_mapping_options`]. Example records:
///
/// ```text
/// -[ RECORD 1 ]--+----------------
/// tx_ac | ENST00000000233
/// alt_ac | NC_000007.13
/// alt_aln_method | genebuild
/// -[ RECORD 2 ]--+----------------
/// tx_ac | ENST00000000412
/// alt_ac | NC_000012.11
/// alt_aln_method | genebuild
/// ```
///
/// All fields are strings, so the record is `Eq` and `Hash` and can be used
/// in hash-based collections.
#[derive(Debug, PartialEq, Eq, Hash, Default, Clone)]
pub struct TxMappingOptionsRecord {
    /// Transcript accession (e.g., `ENST00000000233`).
    pub tx_ac: String,
    /// Accession of the sequence the transcript is aligned to (e.g., `NC_000007.13`).
    pub alt_ac: String,
    /// Sequence alignment method (e.g., `genebuild`).
    pub alt_aln_method: String,
}
181
/// Interface for data providers.
///
/// A provider gives access to the transcript database: data and schema
/// versions, gene and transcript records, transcript alignments, and
/// sequences. Fallible methods return `Result<_, Error>`; the trait itself
/// does not prescribe which failures an implementation reports.
pub trait Provider {
    /// Return the data version, e.g., `uta_20210129`.
    fn data_version(&self) -> &str;

    /// Return the schema version, e.g., `"1.1"`.
    fn schema_version(&self) -> &str;

    /// Return a map from accession to chromosome name for the given assembly.
    ///
    /// For example, for the assembly named `GRCh38.p5`, the value for `"NC_000001.11"`
    /// would be `"1"`.
    ///
    /// # Arguments
    ///
    /// * `assembly` -- The assembly to build the map for.
    fn get_assembly_map(&self, assembly: Assembly) -> IndexMap<String, String>;

    /// Returns the basic information about the gene.
    ///
    /// # Arguments
    ///
    /// * `hgnc` -- HGNC gene name
    fn get_gene_info(&self, hgnc: &str) -> Result<GeneInfoRecord, Error>;

    /// Return the (single) associated protein accession for a given transcript accession,
    /// or `None` if not found.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result<Option<String>, Error>;

    /// Return full sequence for the given accession.
    ///
    /// The default implementation calls [`Provider::get_seq_part`] with both
    /// bounds set to `None`.
    ///
    /// # Arguments
    ///
    /// * `ac` -- accession
    fn get_seq(&self, ac: &str) -> Result<String, Error> {
        self.get_seq_part(ac, None, None)
    }

    /// Return sequence part for the given accession.
    ///
    /// # Arguments
    ///
    /// * `ac` -- accession
    /// * `begin` -- start position (0-based, start of sequence if missing)
    /// * `end` -- end position (0-based, end of sequence if missing)
    fn get_seq_part(
        &self,
        ac: &str,
        begin: Option<usize>,
        end: Option<usize>,
    ) -> Result<String, Error>;

    /// Returns a list of protein accessions for a given sequence.
    ///
    /// The list is guaranteed to contain at least one element with the MD5-based accession
    /// (MD5_01234abc..def56789) at the end of the list.
    ///
    /// # Arguments
    ///
    /// * `seq` -- protein sequence to look up accessions for
    fn get_acs_for_protein_seq(&self, seq: &str) -> Result<Vec<String>, Error>;

    /// Return a list of transcripts that are similar to the given transcript, with relevant
    /// similarity criteria.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    fn get_similar_transcripts(&self, tx_ac: &str) -> Result<Vec<TxSimilarityRecord>, Error>;

    /// Return transcript exon info for supplied accession (tx_ac, alt_ac, alt_aln_method),
    /// or empty `Vec` if not found.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    /// * `alt_ac` -- specific genomic sequence (e.g., NC_000011.4)
    /// * `alt_aln_method` -- sequence alignment method (e.g., splign, blat)
    fn get_tx_exons(
        &self,
        tx_ac: &str,
        alt_ac: &str,
        alt_aln_method: &str,
    ) -> Result<Vec<TxExonsRecord>, Error>;

    /// Return transcript info records for supplied gene, in order of decreasing length.
    ///
    /// # Arguments
    ///
    /// * `gene` -- HGNC gene name
    fn get_tx_for_gene(&self, gene: &str) -> Result<Vec<TxInfoRecord>, Error>;

    /// Return transcripts that overlap given region.
    ///
    /// # Arguments
    ///
    /// * `alt_ac` -- reference sequence (e.g., NC_000007.13)
    /// * `alt_aln_method` -- alignment method (e.g., splign)
    /// * `start_i` -- 5' bound of region
    /// * `end_i` -- 3' bound of region
    fn get_tx_for_region(
        &self,
        alt_ac: &str,
        alt_aln_method: &str,
        start_i: i32,
        end_i: i32,
    ) -> Result<Vec<TxForRegionRecord>, Error>;

    /// Return features associated with a single transcript.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_199425.2')
    fn get_tx_identity_info(&self, tx_ac: &str) -> Result<TxIdentityInfo, Error>;

    /// Return a single transcript info for supplied accession (tx_ac, alt_ac, alt_aln_method).
    ///
    /// The return type carries no `Option`, so a missing record can only be
    /// reported through the `Err` variant.
    // NOTE(review): confirm which `Error` value implementations return when nothing is found.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    /// * `alt_ac` -- specific genomic sequence (e.g., NC_000011.4)
    /// * `alt_aln_method` -- sequence alignment method (e.g., splign, blat)
    fn get_tx_info(
        &self,
        tx_ac: &str,
        alt_ac: &str,
        alt_aln_method: &str,
    ) -> Result<TxInfoRecord, Error>;

    /// Return all transcript alignment sets for a given transcript accession (tx_ac).
    ///
    /// Returns empty list if transcript does not exist. Use this method to discover
    /// possible mapping options supported in the database.
    ///
    /// # Arguments
    ///
    /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3')
    fn get_tx_mapping_options(&self, tx_ac: &str) -> Result<Vec<TxMappingOptionsRecord>, Error>;
}
321
322// <LICENSE>
323// Copyright 2023 hgvs-rs Contributors
324// Copyright 2014 Bioutils Contributors
325//
326// Licensed under the Apache License, Version 2.0 (the "License");
327// you may not use this file except in compliance with the License.
328// You may obtain a copy of the License at
329//
330// http://www.apache.org/licenses/LICENSE-2.0
331//
332// Unless required by applicable law or agreed to in writing, software
333// distributed under the License is distributed on an "AS IS" BASIS,
334// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
335// See the License for the specific language governing permissions and
336// limitations under the License.
337// </LICENSE>