// chiral_common/data/doc/smiles.rs

//! Document SMILES
//!
//! A standardized database that pairs each entry id with its SMILES string,
//! converted from other datasets.
4
5use serde::{Serialize, Deserialize}; 
6use crate::traits::{Serialization, SerializedFormat};
7use chiral_derive::Serialization;
8
9#[derive(Serialize, Deserialize, Serialization)]
10pub struct DocSMILES {
11    ids: Vec<crate::data::types::EntryID>,
12    smiles: Vec<crate::app::chem::types::SMILES>,
13}
14
15impl DocSMILES {
16    pub fn empty() -> Self {
17        Self { ids: vec![], smiles: vec![] }
18    }
19
20    pub fn new(ids_in: Vec<crate::data::types::EntryID>, smiles_in: Vec<crate::app::chem::types::SMILES>) -> Self {
21        let mut perm = permutation::sort(&ids_in);
22        let mut ids = ids_in;
23        perm.apply_slice_in_place(&mut ids);
24        let mut smiles = smiles_in;
25        perm.apply_slice_in_place(&mut smiles);
26
27        Self { ids, smiles }
28    }
29
30    pub fn get_smiles(&self, id: &crate::data::types::EntryID) -> Option<&crate::app::chem::types::SMILES> {
31        match self.ids.binary_search(id) {
32            Ok(index) => self.smiles.get(index),
33            Err(_) => None
34        }
35    }
36
37    pub fn extract_ids(&self, range: &std::ops::Range<usize>) -> Vec<crate::data::types::EntryID> { self.get_ids().as_slice()[range.to_owned()].to_vec() }
38    pub fn extract_smiles_vec(&self, range: &std::ops::Range<usize>) -> Vec<crate::app::chem::types::SMILES> { self.get_smiles_vec().as_slice()[range.to_owned()].to_vec() }
39    pub fn extract(&self, range: &std::ops::Range<usize>) -> Self { Self::new(self.extract_ids(range), self.extract_smiles_vec(range)) }
40
41    pub fn get_smiles_vec(&self) -> &Vec<crate::app::chem::types::SMILES> { &self.smiles }
42    pub fn get_ids(&self) -> &Vec<crate::data::types::EntryID> { &self.ids }
43    pub fn len(&self) -> usize { self.ids.len() }
44}
45
46impl crate::data::Empty for DocSMILES {
47    fn empty() -> Self {
48        Self::new(vec![], vec![])
49    }
50}
51
52impl crate::data::Dummy for DocSMILES {
53    fn dummy() -> Self {
54        let ids = vec![
55            "label_1".to_string(),
56            "label_3".to_string(),
57            "label_2".to_string(),
58            "label_4".to_string()
59        ];
60        let smiles = vec![
61            String::from("O=C(C)Oc1ccccc1C(=O)O"),
62            String::from("N1=C(c3c(Sc2c1cccc2)cccc3)N4CCN(CCOCCO)CC4"),
63            String::from("O=C(O)C[C@H](O)C[C@H](O)CCn2c(c(c(c2c1ccc(F)cc1)c3ccccc3)C(=O)Nc4ccccc4)C(C)C"),
64            String::from("CC(=O)Nc1ccc(O)cc1")
65        ];
66
67        Self::new(ids, smiles)
68    }
69}
70
71impl From<crate::data::SourceChembl> for DocSMILES {
72    fn from(sc: crate::data::SourceChembl) -> Self {
73        let (smiles, ids) = sc.get_smiles_id_pairs();
74        DocSMILES::new(ids, smiles)
75    }
76}
77
78/// Datastore for DocSMILES
79pub type DocStoreSMILES = std::collections::HashMap<crate::kinds::Dataset, DocSMILES>;
80
81impl crate::data::Info for DocStoreSMILES {
82    fn info(&self) -> String {
83        format!("{:15} {:>15}\n", "name", "entries")
84        + vec!["=";31].join("").as_str() + "\n"
85        + self.iter()
86            .map(|(k, v)| format!("{:15} {:15}", k, v.len()))
87            .collect::<Vec<String>>()
88            .join("\n")
89            .as_str()
90    }
91}
92
93
#[cfg(test)]
mod tests {
    use super::*;
    use crate::data::{Dummy, Info};

    /// Loads a ChEMBL chemreps file from `path` and converts it to a document.
    fn load_chembl_doc(path: &str) -> DocSMILES {
        let mut source = crate::data::SourceChembl::new();
        source.set_path(&std::ffi::OsString::from(path));
        source.load_all();
        DocSMILES::from(source)
    }

    /// The dummy document has four entries and supports lookup by id.
    #[test]
    fn test_doc_smiles() {
        let doc = DocSMILES::dummy();
        assert_eq!(doc.ids.len(), 4);
        assert_eq!(doc.smiles.len(), 4);

        let known_id = "label_3".to_string();
        let known_smiles = String::from("N1=C(c3c(Sc2c1cccc2)cccc3)N4CCN(CCOCCO)CC4");
        assert_eq!(doc.get_smiles(&known_id), Some(&known_smiles));

        let unknown_id = "label_5".to_string();
        assert_eq!(doc.get_smiles(&unknown_id), None);
    }

    /// Converting the 100-row ChEMBL sample keeps every row.
    #[test]
    fn test_chembl() {
        let doc = load_chembl_doc("../../../chiral-db-example-data/ChEMBL/chembl_30_chemreps_100.txt");
        assert_eq!(doc.ids.len(), 100);
        assert_eq!(doc.smiles.len(), 100);
    }

    /// The store summary mentions both datasets and their entry counts.
    #[test]
    fn test_store() {
        let mut store = DocStoreSMILES::new();
        store.insert(crate::kinds::Dataset::Dummy, DocSMILES::dummy());
        store.insert(
            crate::kinds::Dataset::TestChembl,
            load_chembl_doc("../../../chiral-db-example-data/ChEMBL/chembl_30_chemreps_10k.txt"),
        );

        // The store is not modified after this point, so one rendering
        // serves the printout and every assertion.
        let summary = store.info();
        println!("{}", summary);
        for needle in ["test_chembl", "dummy", "4", "100"].iter() {
            assert!(summary.contains(needle));
        }
    }

    /// A document loaded from disk can still be searched by id.
    #[test]
    fn test_permutation() {
        let dsk = crate::kinds::Dataset::TestChembl;
        let filepath = std::path::PathBuf::from("../../../chiral-db-example-data/ChEMBL");
        let doc = crate::data::load_from_path::<DocSMILES>(&dsk, &filepath);

        let id = "CHEMBL10030".to_string();
        let found = doc.get_smiles(&id).unwrap().to_string();
        let expected = "O=C(c1ccc(OCCN2CCCC2)cc1)c1c(-c2ccc(O)cc2)sc2cc(O)ccc12".to_string();
        assert_eq!(found, expected);
    }
}
141        assert_eq!(doc.get_smiles(&id).unwrap().to_string(), "O=C(c1ccc(OCCN2CCCC2)cc1)c1c(-c2ccc(O)cc2)sc2cc(O)ccc12".to_string());
142    }
143}