// chiral_common/data/source/chembl.rs
use std::io::prelude::*;
use rand::prelude::*;

/// Compound identifier; the first field of a row (e.g. `CHEMBL503634`).
type ChemblID = String;
/// SMILES string; the second field of a row.
type CanonicalSMILES = String;
/// InChI string; the third field of a row.
type StandardInchi = String;
/// InChIKey; the fourth field of a row.
type StandardInchiKey = String;

/// One compound record: identifier plus its three structure representations.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct EntryChembl {
    /// Compound identifier, also used as the key when entries are stored in a map.
    pub chembl_id: ChemblID,
    /// SMILES representation of the compound.
    pub smiles: CanonicalSMILES,
    /// InChI representation of the compound.
    pub inchi: StandardInchi,
    /// InChIKey of the compound.
    pub inchi_key: StandardInchiKey,
}

impl EntryChembl {
    /// Builds an entry from the already-split fields of one row.
    ///
    /// The first four elements of `v` are copied, in order, into `chembl_id`,
    /// `smiles`, `inchi` and `inchi_key`. Any further elements are ignored.
    ///
    /// # Panics
    ///
    /// Panics if `v` holds fewer than four fields.
    pub fn new(v: Vec<&str>) -> Self {
        match v.as_slice() {
            [chembl_id, smiles, inchi, inchi_key, ..] => Self {
                chembl_id: String::from(*chembl_id),
                smiles: String::from(*smiles),
                inchi: String::from(*inchi),
                inchi_key: String::from(*inchi_key),
            },
            // State the requirement explicitly instead of surfacing a bare
            // "index out of bounds" from positional indexing.
            short => panic!(
                "EntryChembl::new needs at least 4 fields (chembl_id, smiles, inchi, inchi_key), got {}",
                short.len()
            ),
        }
    }
}

28type DataChembl = std::collections::HashMap<ChemblID, EntryChembl>;
29
30pub struct SourceChembl {
31 path: std::path::PathBuf,
32 data: DataChembl
33}
34
35impl std::default::Default for SourceChembl {
36 fn default() -> Self {
37 let mut sc = Self::new();
38 let kind = crate::kinds::Dataset::Chembl30;
39 let chembl_txt = std::env::var_os(kind.env_key()).expect(format!("{} not set", kind.env_key()).as_str());
40 sc.set_path(&chembl_txt);
41 sc.load_all();
42 sc
43 }
44}
45
46impl SourceChembl {
47 pub fn new() -> Self {
48 Self {
49 path: std::path::PathBuf::new(),
50 data: DataChembl::new(),
51 }
52 }
53
54 pub fn set_path(&mut self, path_str: &std::ffi::OsStr) {
55 self.path = std::path::PathBuf::from(path_str)
56 }
57
58
59 fn sanitize(&mut self) {
60 self.data.remove("chembl_id");
61 }
62
63 fn convert_lines(&mut self, lines: impl std::iter::Iterator<Item = std::io::Result<String>>) {
64 self.data.clear();
65 self.data = lines.map(|l| {
66 let line = l.unwrap();
67 let v = line.as_str().split("\t").collect::<Vec<&str>>(); (String::from(v[0]), EntryChembl::new(v))
70 }
71 )
72 .collect::<Vec<(ChemblID, EntryChembl)>>()
73 .into_iter()
74 .collect();
75
76 self.sanitize();
77 }
78
79 pub fn load_all(&mut self) {
80 match std::fs::File::open(&self.path) {
81 Ok(file) => {
82 let lines = std::io::BufReader::new(file).lines();
83 self.convert_lines(lines);
84 },
85 Err(e) => crate::logging::error(format!("Error {} on file path: {:?}", e, self.path).as_str())
86 }
87 }
88
89 pub fn load_partial(&mut self, range: &std::ops::Range<usize>) {
90 let file = std::fs::File::open(&self.path).unwrap();
91 let reader = std::io::BufReader::new(file);
92 self.convert_lines(reader.lines().skip(range.start).take(range.len()));
93 }
94
95 pub fn get(&self, id: &ChemblID) -> Option<&EntryChembl> { self.data.get(id) }
96 pub fn get_all(&self) -> &DataChembl { &self.data }
97 pub fn len(&self) -> usize { self.data.len() }
98
99 pub fn get_smiles_id_pairs(&self) -> (Vec<String>, Vec<String>) {
100 (
101 self.data.values()
102 .map(|ec| ec.smiles.clone())
103 .collect(),
104 self.data.keys()
105 .map(|id| id.clone())
106 .collect()
107 )
108 }
109
110 pub fn choices(&self, size: usize) -> Vec<&EntryChembl> {
111 let mut rng = thread_rng();
112 let marks: Vec<bool> = (0..self.len())
113 .map(|_| rng.gen_range(0..self.len()) <= size * 2 )
114 .collect();
115
116 self.data.values().enumerate()
117 .filter(|(idx, _)| marks[*idx])
118 .map(|(_, v)| v)
119 .take(size)
120 .collect()
121 }
122}
123
#[cfg(test)]
mod test_chembl {
    use super::*;

    /// Exercises a full load and a partial load against the 100-record sample
    /// file, checking counts, lookups and field values.
    #[test]
    fn test_source_chembl() {
        let sample_path = std::ffi::OsString::from("../../../chiral-db-example-data/ChEMBL/chembl_30_chemreps_100.txt");
        let id_in_full_load = String::from("CHEMBL503634");
        let id_in_partial_load = String::from("CHEMBL501923");

        let mut sc = SourceChembl::new();
        sc.set_path(&sample_path);

        // Full load: all 100 records are present and retrievable.
        sc.load_all();
        assert_eq!(sc.len(), 100);
        let full_entry = sc.get(&id_in_full_load).unwrap();
        assert_eq!(full_entry.smiles, "COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O");
        assert_eq!(full_entry.inchi, "InChI=1S/C15H15NO5/c1-21-15-12(19)7-11(18)13(14(15)20)10(16)6-8-2-4-9(17)5-3-8/h2-5,7,16-20H,6H2,1H3");
        assert_eq!(full_entry.inchi_key, "OPELSESCRGGKAM-UHFFFAOYSA-N");
        assert_eq!(sc.get_all().keys().count(), 100);
        assert_eq!(sc.choices(10).len(), 10);

        // Partial load: only lines 30..40 remain, replacing the earlier data.
        sc.load_partial(&(30..40));
        assert_eq!(sc.len(), 10);
        assert_eq!(sc.get(&id_in_full_load), None);
        let partial_entry = sc.get(&id_in_partial_load).unwrap();
        assert_eq!(partial_entry.smiles, "CC(C)=CCC/C(C)=C/Cc1c2c(c3oc4c(c(=O)c3c1O)CC1c3c(c(O)cc(O)c3-4)OC1(C)C)C=CC(C)(C)O2");
        assert_eq!(partial_entry.inchi, "InChI=1S/C35H38O7/c1-17(2)9-8-10-18(3)11-12-19-28(38)27-29(39)21-15-22-25-26(23(36)16-24(37)33(25)42-35(22,6)7)32(21)40-31(27)20-13-14-34(4,5)41-30(19)20/h9,11,13-14,16,22,36-38H,8,10,12,15H2,1-7H3/b18-11+");
        assert_eq!(partial_entry.inchi_key, "UJHMTIUPFDVYQA-WOJGMQOQSA-N");
    }
}