chiral_common/data/source/
chembl.rs

1//! Database ChEMBL
2//! https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/
3//! 
4
5use std::io::prelude::*;
6use rand::prelude::*;
7
/// ChEMBL compound identifier, e.g. `CHEMBL503634`.
type ChemblID = String;
/// Canonical SMILES string of a compound.
type CanonicalSMILES = String;
/// Standard InChI string of a compound (may be blank for some entries).
type StandardInchi = String;
/// Standard InChIKey of a compound.
type StandardInchiKey = String;

/// One row of the ChEMBL chemical representations table.
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct EntryChembl {
    pub chembl_id: ChemblID,
    pub smiles: CanonicalSMILES,
    pub inchi: StandardInchi,
    pub inchi_key: StandardInchiKey,
}

impl EntryChembl {
    /// Builds an entry from the columns of one row, in the order
    /// id, SMILES, InChI, InChIKey.
    ///
    /// Missing trailing columns are filled with empty strings instead of
    /// panicking on an out-of-bounds index, so a truncated row still yields
    /// an entry. Columns beyond the fourth are ignored.
    pub fn new(v: Vec<&str>) -> Self {
        let mut columns = v.into_iter().map(String::from);
        // Each call consumes the next column, or yields "" once the row runs out.
        let mut next_column = || columns.next().unwrap_or_default();

        let chembl_id = next_column();
        let smiles = next_column();
        let inchi = next_column();
        let inchi_key = next_column();

        Self { chembl_id, smiles, inchi, inchi_key }
    }
}
27
28type DataChembl = std::collections::HashMap<ChemblID, EntryChembl>;
29
30pub struct SourceChembl {
31    path: std::path::PathBuf,
32    data: DataChembl
33}
34
35impl std::default::Default for SourceChembl {
36    fn default() -> Self {
37        let mut sc = Self::new();
38        let kind = crate::kinds::Dataset::Chembl30;
39        let chembl_txt = std::env::var_os(kind.env_key()).expect(format!("{} not set", kind.env_key()).as_str());
40        sc.set_path(&chembl_txt);
41        sc.load_all();
42        sc
43    }
44}
45
46impl SourceChembl {
47    pub fn new() -> Self {
48        Self {
49            path: std::path::PathBuf::new(),
50            data: DataChembl::new(),
51        }
52    }
53
54    pub fn set_path(&mut self, path_str: &std::ffi::OsStr) {
55        self.path = std::path::PathBuf::from(path_str)
56    }
57
58
59    fn sanitize(&mut self) {
60        self.data.remove("chembl_id");
61    }
62
63    fn convert_lines(&mut self, lines: impl std::iter::Iterator<Item = std::io::Result<String>>) {
64        self.data.clear();
65        self.data = lines.map(|l| {
66                let line = l.unwrap();
67                // let v = line.as_str().split_whitespace().collect::<Vec<&str>>();
68                let v = line.as_str().split("\t").collect::<Vec<&str>>(); // whitespace is not working for some entries like 1077164, inchi is blank
69                (String::from(v[0]), EntryChembl::new(v))
70            }
71        )
72        .collect::<Vec<(ChemblID, EntryChembl)>>()
73        .into_iter()
74        .collect();
75
76        self.sanitize();
77    }
78
79    pub fn load_all(&mut self) {
80        match std::fs::File::open(&self.path) {
81            Ok(file) => {
82                let lines = std::io::BufReader::new(file).lines();
83                self.convert_lines(lines);
84            },
85            Err(e) => crate::logging::error(format!("Error {} on file path: {:?}", e, self.path).as_str())
86        }
87    }
88
89    pub fn load_partial(&mut self, range: &std::ops::Range<usize>) {
90        let file = std::fs::File::open(&self.path).unwrap();
91        let reader = std::io::BufReader::new(file);
92        self.convert_lines(reader.lines().skip(range.start).take(range.len()));
93    }
94
95    pub fn get(&self, id: &ChemblID) -> Option<&EntryChembl> { self.data.get(id) }
96    pub fn get_all(&self) -> &DataChembl { &self.data }
97    pub fn len(&self) -> usize { self.data.len() }
98
99    pub fn get_smiles_id_pairs(&self) -> (Vec<String>, Vec<String>) {
100        (
101            self.data.values()
102                .map(|ec| ec.smiles.clone())
103                .collect(),
104            self.data.keys()
105                .map(|id| id.clone())
106                .collect()
107        )
108    }
109
110    pub fn choices(&self, size: usize) -> Vec<&EntryChembl> {
111        let mut rng = thread_rng();
112        let marks: Vec<bool> = (0..self.len())
113            .map(|_| rng.gen_range(0..self.len()) <= size * 2 )
114            .collect();
115
116        self.data.values().enumerate()
117            .filter(|(idx, _)| marks[*idx])
118            .map(|(_, v)| v)
119            .take(size)
120            .collect()
121    }
122}
123
#[cfg(test)]
mod test_chembl {
    use super::*;

    /// Exercises a full load followed by a partial load of the 100-entry sample file.
    #[test]
    fn test_source_chembl() {
        let sample_path = std::ffi::OsString::from("../../../chiral-db-example-data/ChEMBL/chembl_30_chemreps_100.txt");
        let id_full = String::from("CHEMBL503634");
        let id_partial = String::from("CHEMBL501923");

        let mut source = SourceChembl::new();
        source.set_path(&sample_path);

        // Whole file: every entry is present and retrievable by id.
        source.load_all();
        assert_eq!(source.len(), 100);
        let entry = source.get(&id_full).unwrap();
        assert_eq!(entry.smiles, "COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O");
        assert_eq!(entry.inchi, "InChI=1S/C15H15NO5/c1-21-15-12(19)7-11(18)13(14(15)20)10(16)6-8-2-4-9(17)5-3-8/h2-5,7,16-20H,6H2,1H3");
        assert_eq!(entry.inchi_key, "OPELSESCRGGKAM-UHFFFAOYSA-N");
        assert_eq!(source.get_all().keys().count(), 100);
        assert_eq!(source.choices(10).len(), 10);

        // Line window 30..40: only those ten entries remain afterwards.
        source.load_partial(&(30..40));
        assert_eq!(source.len(), 10);
        assert_eq!(source.get(&id_full), None);
        let entry = source.get(&id_partial).unwrap();
        assert_eq!(entry.smiles, "CC(C)=CCC/C(C)=C/Cc1c2c(c3oc4c(c(=O)c3c1O)CC1c3c(c(O)cc(O)c3-4)OC1(C)C)C=CC(C)(C)O2");
        assert_eq!(entry.inchi, "InChI=1S/C35H38O7/c1-17(2)9-8-10-18(3)11-12-19-28(38)27-29(39)21-15-22-25-26(23(36)16-24(37)33(25)42-35(22,6)7)32(21)40-31(27)20-13-14-34(4,5)41-30(19)20/h9,11,13-14,16,22,36-38H,8,10,12,15H2,1-7H3/b18-11+");
        assert_eq!(entry.inchi_key, "UJHMTIUPFDVYQA-WOJGMQOQSA-N");
    }
}
153}