chiral_db_sources/
chembl.rs

//! Data source for the ChEMBL database.
//! <https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/>
3//! 
4//! # How to use
5//!
6//! ``` 
7//! use chiral_db_sources::chembl::SourceChembl;
8//! 
9//! let filepath = std::path::Path::new("../chiral-db-example-data/ChEMBL/chembl_30_chemreps_100.txt");
10//! let sc = SourceChembl::new(&filepath);
11//! assert_eq!(sc.len(), 100);
12//! let ec = sc.get(&String::from("CHEMBL503634")).unwrap();
13//! assert_eq!(ec.smiles, "COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O");    
14//! assert_eq!(ec.inchi, "InChI=1S/C15H15NO5/c1-21-15-12(19)7-11(18)13(14(15)20)10(16)6-8-2-4-9(17)5-3-8/h2-5,7,16-20H,6H2,1H3");
15//! assert_eq!(ec.inchi_key, "OPELSESCRGGKAM-UHFFFAOYSA-N");
16//! 
17//! let data_all = sc.get_all();
18//! assert_eq!(data_all.keys().count(), 100);
19//! 
20//! let selected = sc.choices(10);
21//! assert_eq!(selected.len(), 10);
22//! ```
23
24use std::io::prelude::*;
25use rand::prelude::*;
26
/// ChEMBL identifier of a compound, e.g. `CHEMBL503634`.
type ChemblID = String;
/// SMILES string of a compound.
type CanonicalSMILES = String;
/// InChI string of a compound.
type StandardInchi = String;
/// InChI key of a compound.
type StandardInchiKey = String;

/// One record of the ChEMBL chemical-representations table.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EntryChembl {
    /// Identifier taken from the first field of the record.
    pub chembl_id: ChemblID,
    /// SMILES taken from the second field of the record.
    pub smiles: CanonicalSMILES,
    /// InChI taken from the third field of the record.
    pub inchi: StandardInchi,
    /// InChI key taken from the fourth field of the record.
    pub inchi_key: StandardInchiKey,
}

impl EntryChembl {
    /// Builds an entry from the fields of one record.
    ///
    /// The first four elements of `v` are copied, in order, into
    /// `chembl_id`, `smiles`, `inchi` and `inchi_key`. Any further elements
    /// are ignored.
    ///
    /// # Panics
    ///
    /// Panics if `v` holds fewer than four fields.
    pub fn new(v: Vec<&str>) -> Self {
        match v.as_slice() {
            [chembl_id, smiles, inchi, inchi_key, ..] => Self {
                chembl_id: String::from(*chembl_id),
                smiles: String::from(*smiles),
                inchi: String::from(*inchi),
                inchi_key: String::from(*inchi_key),
            },
            // Fail with a message that names the problem instead of a bare
            // index-out-of-bounds panic.
            _ => panic!(
                "expected at least 4 fields (chembl_id, smiles, inchi, inchi_key), got {}",
                v.len()
            ),
        }
    }
}
45
46type DataChembl = std::collections::HashMap<String, EntryChembl>;
47
48pub struct SourceChembl {
49    data: DataChembl 
50}
51
52impl SourceChembl {
53    pub fn new(filepath: &std::path::Path) -> Self {
54        let mut sc = Self { data: DataChembl::new() };
55        sc.load(filepath);
56        sc
57    }
58
59    pub fn new_default() -> Self {
60        let mut sc = Self { data: DataChembl::new() };
61        let chembl_txt = std::env::var_os("CHIRAL_DB_CHEMBL_TXT").expect("CHIRAL_DB_CHEMBL_TXT to be set as the default source");
62        sc.load(&std::path::Path::new(&chembl_txt));
63        sc
64    }
65
66    fn sanitize(&mut self) {
67        self.data.remove("chembl_id");
68    }
69
70    pub fn load(&mut self, filepath: &std::path::Path) {
71        self.data.clear();
72
73        let file = std::fs::File::open(filepath).unwrap();
74        let reader = std::io::BufReader::new(file);
75        self.data = reader.lines()
76            .map(|l| {
77                    let line = l.unwrap();
78                    let v = line.as_str().split('\t').collect::<Vec<&str>>();
79                    (String::from(v[0]), EntryChembl::new(v))
80                }
81            )
82            .collect::<Vec<(ChemblID, EntryChembl)>>()
83            .into_iter()
84            .collect();
85
86        self.sanitize();
87    }
88
89    pub fn get(&self, id: &ChemblID) -> Option<&EntryChembl> {
90        self.data.get(id)
91    }
92
93    pub fn get_all(&self) -> &DataChembl {
94        &self.data
95    }
96
97    pub fn get_smiles_id_pairs(&self) -> (Vec<&String>, Vec<String>) {
98        (
99            self.data.values()
100                .map(|ec| &ec.smiles)
101                .collect(),
102            self.data.keys()
103                .map(|id| id.clone())
104                .collect()
105        )
106    }
107
108    pub fn len(&self) -> usize {
109        self.data.len()
110    }
111
112    pub fn choices(&self, size: usize) -> Vec<&EntryChembl> {
113        let mut rng = thread_rng();
114        let marks: Vec<bool> = (0..self.len())
115            .map(|_| rng.gen_range(0..self.len()) <= size * 2 )
116            .collect();
117
118        self.data.values().enumerate()
119            .filter(|(idx, _)| marks[*idx])
120            .map(|(_, v)| v)
121            .take(size)
122            .collect()
123    }
124}
125
#[cfg(test)]
mod test_chembl {
    use super::*;

    /// Loads the 100-record sample file and checks size, keyed lookup,
    /// whole-map access and random selection.
    #[test]
    fn test_source_chembl() {
        let sample = std::path::Path::new("../chiral-db-example-data/ChEMBL/chembl_30_chemreps_100.txt");
        let source = SourceChembl::new(&sample);

        // Size, via both the direct accessor and the exposed map.
        assert_eq!(source.len(), 100);

        // Keyed lookup returns the full record.
        let wanted = String::from("CHEMBL503634");
        let entry = source.get(&wanted).unwrap();
        assert_eq!(entry.smiles, "COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O");
        assert_eq!(entry.inchi, "InChI=1S/C15H15NO5/c1-21-15-12(19)7-11(18)13(14(15)20)10(16)6-8-2-4-9(17)5-3-8/h2-5,7,16-20H,6H2,1H3");
        assert_eq!(entry.inchi_key, "OPELSESCRGGKAM-UHFFFAOYSA-N");

        let everything = source.get_all();
        assert_eq!(everything.keys().count(), 100);

        // Random selection of a fixed number of entries.
        let picked = source.choices(10);
        assert_eq!(picked.len(), 10);
    }
}