chiral_db_sources/
chembl.rs1use std::io::prelude::*;
25use rand::prelude::*;
26
/// Identifier of a ChEMBL record (first column of the source file).
type ChemblID = String;

/// Canonical SMILES string of a record.
type CanonicalSMILES = String;

/// Standard InChI string of a record.
type StandardInchi = String;

/// Standard InChI key of a record.
type StandardInchiKey = String;
31
32pub struct EntryChembl {
33 pub chembl_id: ChemblID,
34 pub smiles: CanonicalSMILES,
35 pub inchi: StandardInchi,
36 pub inchi_key: StandardInchiKey
37}
38
39impl EntryChembl {
40 pub fn new(v: Vec<&str>) -> Self {
41 let (chembl_id, smiles, inchi, inchi_key) = (String::from(v[0]), String::from(v[1]), String::from(v[2]), String::from(v[3]));
42 Self { chembl_id, smiles, inchi, inchi_key }
43 }
44}
45
46type DataChembl = std::collections::HashMap<String, EntryChembl>;
47
48pub struct SourceChembl {
49 data: DataChembl
50}
51
52impl SourceChembl {
53 pub fn new(filepath: &std::path::Path) -> Self {
54 let mut sc = Self { data: DataChembl::new() };
55 sc.load(filepath);
56 sc
57 }
58
59 pub fn new_default() -> Self {
60 let mut sc = Self { data: DataChembl::new() };
61 let chembl_txt = std::env::var_os("CHIRAL_DB_CHEMBL_TXT").expect("CHIRAL_DB_CHEMBL_TXT to be set as the default source");
62 sc.load(&std::path::Path::new(&chembl_txt));
63 sc
64 }
65
66 fn sanitize(&mut self) {
67 self.data.remove("chembl_id");
68 }
69
70 pub fn load(&mut self, filepath: &std::path::Path) {
71 self.data.clear();
72
73 let file = std::fs::File::open(filepath).unwrap();
74 let reader = std::io::BufReader::new(file);
75 self.data = reader.lines()
76 .map(|l| {
77 let line = l.unwrap();
78 let v = line.as_str().split('\t').collect::<Vec<&str>>();
79 (String::from(v[0]), EntryChembl::new(v))
80 }
81 )
82 .collect::<Vec<(ChemblID, EntryChembl)>>()
83 .into_iter()
84 .collect();
85
86 self.sanitize();
87 }
88
89 pub fn get(&self, id: &ChemblID) -> Option<&EntryChembl> {
90 self.data.get(id)
91 }
92
93 pub fn get_all(&self) -> &DataChembl {
94 &self.data
95 }
96
97 pub fn get_smiles_id_pairs(&self) -> (Vec<&String>, Vec<String>) {
98 (
99 self.data.values()
100 .map(|ec| &ec.smiles)
101 .collect(),
102 self.data.keys()
103 .map(|id| id.clone())
104 .collect()
105 )
106 }
107
108 pub fn len(&self) -> usize {
109 self.data.len()
110 }
111
112 pub fn choices(&self, size: usize) -> Vec<&EntryChembl> {
113 let mut rng = thread_rng();
114 let marks: Vec<bool> = (0..self.len())
115 .map(|_| rng.gen_range(0..self.len()) <= size * 2 )
116 .collect();
117
118 self.data.values().enumerate()
119 .filter(|(idx, _)| marks[*idx])
120 .map(|(_, v)| v)
121 .take(size)
122 .collect()
123 }
124}
125
#[cfg(test)]
mod test_chembl {
    use super::*;

    /// Loads the 100-record example file, then checks a single lookup, the
    /// bulk accessor and random sampling.
    #[test]
    fn test_source_chembl() {
        let example_path =
            std::path::Path::new("../chiral-db-example-data/ChEMBL/chembl_30_chemreps_100.txt");
        let source = SourceChembl::new(example_path);
        assert_eq!(source.len(), 100);

        // Spot-check one known record field by field.
        let wanted_id = String::from("CHEMBL503634");
        let entry = source.get(&wanted_id).unwrap();
        assert_eq!(entry.smiles, "COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O");
        assert_eq!(entry.inchi, "InChI=1S/C15H15NO5/c1-21-15-12(19)7-11(18)13(14(15)20)10(16)6-8-2-4-9(17)5-3-8/h2-5,7,16-20H,6H2,1H3");
        assert_eq!(entry.inchi_key, "OPELSESCRGGKAM-UHFFFAOYSA-N");

        let everything = source.get_all();
        assert_eq!(everything.keys().count(), 100);

        let sampled = source.choices(10);
        assert_eq!(sampled.len(), 10);
    }
}