ncbi 0.2.0-beta

Rust data structures for NCBI APIs
Documentation
use std::hash::Hash;
use crate::general::DbTag;
use crate::seqfeat::{BioSource, GeneRef, ProtRef, RnaRef};
use crate::seqloc::SeqLoc;

/// Kind of gene; stored in the `type` field of [`EntrezGene`].
///
/// Every variant is a plain unit variant, so values are cheap to copy and can
/// be compared, hashed and printed directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GeneType {
    Unknown,
    /// Transfer RNA
    TRNA,
    /// Ribosomal RNA
    RRNA,
    /// Small nuclear RNA
    SnRNA,
    /// Small cytoplasmic RNA
    ScRNA,
    /// Small nucleolar RNA
    SnoRNA,
    ProteinCoding,
    Pseudo,
    Transposon,
    /// Miscellaneous RNA
    MiscRNA,
    /// Non-coding RNA
    NcRNA,
    BiologicalRegion,
    Other,
}

/// A single Entrez Gene record.
///
/// A list of these records is exposed as [`GeneSet`]. All fields are private
/// to this module.
pub struct EntrezGene {
    /// Tracking information; not in submission, but in retrieval
    track_info: Option<Vec<GeneTrack>>,
    /// Type of gene
    r#type: GeneType,
    // NOTE(review): presumably the organism/source the gene belongs to —
    // confirm against `BioSource`.
    source: BioSource,
    /// Gene reference. Expected to be unique per organism (tax_id)
    gene: GeneRef,
    /// Protein reference, if any. Expected to be unique over all organisms
    prot: Option<ProtRef>,
    /// RNA reference, if any. Expected to be unique over all organisms
    rna: Option<RnaRef>,
    /// Short summary
    summary: Option<String>,
    // NOTE(review): stored as bare integer pairs; the `Map` type declared
    // further down in this file is not used here — confirm which
    // representation is intended and what the two numbers mean.
    location: Vec<(u64, u64)>,
    /// NCBI source to Entrez
    gene_source: Option<Vec<GeneSource>>,
    /// Location of gene on chromosome (if known)
    /// and all information about products
    /// (mRNA, proteins, and so on)
    locus: Option<Vec<GeneCommentary>>,
    /// Commentary entries describing properties of the gene
    properties: Option<Vec<GeneCommentary>>,
    /// Commentary entries describing homology
    homology: Option<Vec<GeneCommentary>>,
    /// Free-form commentary entries
    comments: Option<Vec<GeneCommentary>>,
    // NOTE(review): the difference between `unique_keys` and
    // `non_unique_keys` is only visible in the names — confirm the
    // uniqueness scope against the upstream specification.
    unique_keys: Option<Vec<DbTag>>,
    /// Extra index terms as plain strings
    xtra_index_terms: Option<Vec<String>>,
    /// Extra properties as tag/value pairs
    xtra_properties: Option<Vec<XtraTerms>>,
    /// Extra tag/value pairs (`iq`) — NOTE(review): meaning of "iq" is not
    /// visible in this file; confirm.
    xtra_iq: Option<Vec<XtraTerms>>,
    non_unique_keys: Option<Vec<DbTag>>,
}

/// A collection of [`EntrezGene`] records, held in a plain `Vec`.
pub type GeneSet = Vec<EntrezGene>;

/// Status of a gene record; stored in the `status` field of [`GeneTrack`].
///
/// `GeneStatus::default()` is [`GeneStatus::Live`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum GeneStatus {
    #[default]
    Live,
    Secondary,
    Discontinued,
}

/// Tracking metadata for a gene record; used by the `track_info` field of
/// [`EntrezGene`].
pub struct GeneTrack {
    /// Required unique document id
    geneid: u64,
    /// Status of the record (live, secondary or discontinued)
    status: GeneStatus,
    // NOTE(review): presumably the identifiers that are current for this
    // record when it is no longer live — confirm.
    current_id: Option<Vec<DbTag>>,
    // NOTE(review): all dates are stored as bare `u64`; the unit/encoding is
    // not visible in this file — confirm before relying on it.
    /// Date the record was created
    create_date: u64,
    /// Date the record was last updated
    update_date: u64,
    /// Date the record was discontinued, if it has been
    discontinued_date: Option<u64>
}

/// Describes where a gene record came from; used by the `gene_source` field
/// of [`EntrezGene`].
///
/// `GeneSource::default()` yields an empty `src`, `None` for every optional
/// field and `false` for all three display flags.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct GeneSource {
    /// Key to the source within NCBI (locuslink, E. coli, etc.)
    src: String,
    /// e.g. locuslink id
    src_int: Option<i64>,
    /// e.g. chromosome1
    src_str1: Option<String>,
    /// Expected to be unique per organism (tax_id)
    src_str2: Option<String>,
    /// Do we have a URL for gene display?
    ///
    /// Defaults to `false` (the `bool` default used by the derived `Default`).
    gene_display: bool,
    /// Do we have a URL for map/locus display?
    ///
    /// Defaults to `false` (the `bool` default used by the derived `Default`).
    locus_display: bool,
    /// Do we have a URL for extra indexing terms?
    ///
    /// Defaults to `false` (the `bool` default used by the derived `Default`).
    extra_terms: bool,
}

/// Kind of a [`GeneCommentary`] entry; stored in its `type` field.
///
/// Every variant is a plain unit variant, so values are cheap to copy and can
/// be compared, hashed and printed directly.
// Several variants keep their conventional spelling (`mRNA`, `tRNA`,
// `Genomic_mRNA`, ...), which the camel-case lint would otherwise reject.
#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GeneCommentaryType {
    Genomic,
    PreRNA,
    mRNA,
    rRNA,
    tRNA,
    snRNA,
    scRNA,
    Peptide,
    OtherGenetic,
    Genomic_mRNA,
    cRNA,
    MaturePeptide,
    PreProtein,
    MiscRNA,
    snoRNA,
    /// Used to display a tag/value pair:
    /// for this type the label is used as the property tag,
    /// the text is used as the property value,
    /// and the other fields are not used
    Property,
    /// Currently not used
    Reference,
    /// To include generif in the main blob
    Generif,
    /// Used to display phenotype information
    Phenotype,
    /// Used (but not limited) to identify resulting interaction complexes
    Complex,
    /// PubChem entities
    Compound,
    ncRNA,
    /// For relationship sets (such as pseudo-gene / parent gene)
    GeneGroup,
    /// For full assembly accession
    Assembly,
    /// For the assembly unit corresponding to the refseq
    AssemblyUnit,
    CRegion,
    DSegment,
    JSegment,
    VSegment,
    Comment,
    Other,
}

pub struct GeneCommentary {
    r#type: GeneCommentaryType,
    /// appears above text
    heading: Option<String>,
    /// occurs to left of text for protein and RNA types it is a name
    /// for property type it is a property tag
    label: Option<String>,
    /// block of text
    /// for property type, it is a property value
    text: Option<String>,
    /// accession for the gi in the seqLoc
    /// expected to be unique over all organisms
    accession: Option<String>,
    /// version for the accession above
    version: Option<u64>,
    /// are tag/value pairs of properties/fields as defined in the Entrez
    /// database (i.e.: UNIGENE/Hs.74561)
    xtra_properties: Option<Vec<XtraTerms>>,
    /// refs for this
    refs: Option<Vec<Pub>>,
    /// links and refs
    source: Option<Vec<OtherSource>>,
    /// referenced sequences in genomic coordinates
    genomic_coords: Option<Vec<SeqLoc>>,
    /// referenced sequences in non-genomic coordinates
    seqs: Option<VEc<SeqLoc>>,
    products: Option<Vec<Self>>,
    properties: Option<Vec<Self>>,
    comment: Option<Vec<Self>>,
    create_date: Option<u64>,
    update_date: Option<u64>,
    rna: Option<RNARef>,
}

/// A link or reference attached to a [`GeneCommentary`] through its `source`
/// field.
///
/// The text fields describe how the link is rendered: `pre_text`, then the
/// highlighted `anchor`, then `post_text`.
pub struct OtherSource {
    /// Key to non-NCBI source
    src: Option<DbTag>,
    /// Text before anchor
    pre_text: Option<String>,
    /// Text to show as highlight
    anchor: Option<String>,
    /// If present, use this URL, not DbTag and database
    url: Option<String>,
    /// Text after anchor
    post_text: Option<String>,
}

/// Unit in which a map position is expressed; wrapped by
/// [`MapMethod::MapType`].
// `bp`, `cM` and `cR` keep their conventional unit spelling, which the
// camel-case lint would otherwise reject.
#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MapType {
    Cyto,
    bp,
    cM,
    cR,
    Min,
}

/// How the display string of a [`Map`] is turned into a map view: either via
/// an external URL or via a unit understood by mapviewer.
pub enum MapMethod {
    /// URL to a non-mapviewer map-viewing resource
    Proxy(String),
    /// Units used in the display string to query mapviewer
    MapType(MapType),
}

/// A map position: a display string plus the method used to view it.
// NOTE(review): no other declaration in the visible part of this file refers
// to `Map` (`EntrezGene::location` uses integer pairs) — confirm where it is
// meant to be used.
pub struct Map {
    /// Text describing the position; interpreted according to `method`
    display_str: String,
    /// How `display_str` is resolved (see [`MapMethod`])
    method: MapMethod,
}

/// A tag/value pair of extra terms.
///
/// Used by the `xtra_properties` / `xtra_iq` fields of [`EntrezGene`] and the
/// `xtra_properties` field of [`GeneCommentary`], e.g. tag `UNIGENE` with
/// value `Hs.74561`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct XtraTerms {
    /// Name of the property/field
    tag: String,
    /// Value associated with `tag`
    value: String,
}